From 4ee1c9e60d278a5172c18549bfebbbe533fdfade Mon Sep 17 00:00:00 2001 From: dzhwinter Date: Mon, 19 Mar 2018 19:07:57 -0700 Subject: [PATCH 01/12] "add sequence expand kernel" --- paddle/fluid/operators/sequence_expand_op.cu | 52 +++++++++++++++++++ paddle/fluid/operators/sequence_expand_op.h | 53 +++++++++++++------- 2 files changed, 86 insertions(+), 19 deletions(-) diff --git a/paddle/fluid/operators/sequence_expand_op.cu b/paddle/fluid/operators/sequence_expand_op.cu index 26622d23afa1c..6477af89f110a 100644 --- a/paddle/fluid/operators/sequence_expand_op.cu +++ b/paddle/fluid/operators/sequence_expand_op.cu @@ -15,6 +15,58 @@ limitations under the License. */ #define EIGEN_USE_GPU #include "paddle/fluid/operators/sequence_expand_op.h" +namespace paddle { +namespace operators { + +using LoDTensor = framework::LoDTensor; + +template +__global__ sequence_expand_kernel(const T* x_data, T* out_data, size_t* lod, + size_t element_len) { + int BLOCK_SIZE = 1024; + __shared__ T shm_lod[BLOCK_SIZE]; + for (int idx = threadIdx.x; idx < BLOCK_SIZE; ++idx) { + shm_lod[idx] = lod[idx]; + } + for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < lod.size(); + idx += blockDim.x * gridDim.x) { + int scale = lod[i] + } +} + +template +void SequenceExpandFunctor::operator()( + const platform::CPUDeviceContext& context, const LoDTensor& x, + LoDTensor* out) { + x_dims = x.dims(); + size_t element_len = framework::product(x_dims) / x_dims[0]; + T* out_data = out->mutable_data(context.GetPlace()); + auto out_starts = out->lod().back(); + + const int kThreadsPerBlock = 1024; + int block_cols = kThreadsPerBlock; + if (out_cols < kThreadsPerBlock) { // block_cols is aligned by 32. + block_cols = ((out_cols + 31) >> 5) << 5; + } + int block_rows = kThreadsPerBlock / block_cols; + dim3 block_size = dim3(block_cols, block_rows, 1); + + int max_threads = context.GetMaxPhysicalThreadCount(); + int max_blocks = std::max(max_threads / kThreadsPerBlock, 1); + + int grid_cols = + std::min((out_cols + block_cols - 1) / block_cols, max_blocks); + int grid_rows = + std::min(max_blocks / grid_cols, std::max(out_rows / block_rows, 1)); + dim3 grid_size = dim3(grid_cols, grid_rows, 1); + sequence_expand_kernel<<>>( + x.data(), out->mutable_data(context.GetPlace()), + out_starts.CUDAData(context.GetPlace()), element_len); +} + +} // namespace operators +} // namespace paddle + namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( sequence_expand, diff --git a/paddle/fluid/operators/sequence_expand_op.h b/paddle/fluid/operators/sequence_expand_op.h index 76dde976db2d1..12e4018b95bbf 100644 --- a/paddle/fluid/operators/sequence_expand_op.h +++ b/paddle/fluid/operators/sequence_expand_op.h @@ -16,13 +16,44 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/memory/memcpy.h" -#include "unsupported/Eigen/CXX11/Tensor" +#include "paddle/fluid/platform/device_context.h" namespace paddle { namespace operators { using LoDTensor = framework::LoDTensor; +template +struct SequenceExpandFunctor { + void operator()(const DeviceContext& ctx, const LoDTensor& x, LoDTensor* out); +}; + +// template +// struct SequenceExpandGradFunctor {}; + +template +void SequenceExpandFunctor::operator()( + const platform::CPUDeviceContext& context, const LoDTensor& x, + LoDTensor* out) { + x_dims = x.dims(); + size_t element_len = framework::product(x_dims) / x_dims[0]; + T* out_data = out->mutable_data(context.GetPlace()); + auto out_starts = out->lod().back(); + + for (size_t i = 0; i < out_starts.size() - 1; i++) { + int scale = out_starts[i + 1] - out_starts[i]; + Eigen::TensorMap< + Eigen::Tensor> + x_t(x_data, 1, element_len); + Eigen::TensorMap> + out_t(out_data, scale, element_len); + Eigen::array cast({{scale, 1}}); + out_t.device(*context.eigen_device()) = x_t.broadcast(cast); + x_data += element_len; + out_data += element_len * scale; + } +} + template class SequenceExpandKernel : public framework::OpKernel { public: @@ -38,24 +69,8 @@ class SequenceExpandKernel : public framework::OpKernel { "The size of last lod level in Input(Y)" "must be equal to dims[0] of Input(X)."); out->set_lod(y->lod()); - auto* place = - context.template device_context().eigen_device(); - size_t element_len = framework::product(x_dims) / x_dims[0]; - T* out_data = out->mutable_data(context.GetPlace()); - auto out_starts = out->lod().back(); - - for (size_t i = 0; i < out_starts.size() - 1; i++) { - int scale = out_starts[i + 1] - out_starts[i]; - Eigen::TensorMap< - Eigen::Tensor> - x_t(x_data, 1, element_len); - Eigen::TensorMap> - out_t(out_data, scale, element_len); - Eigen::array cast({{scale, 1}}); - out_t.device(*place) = x_t.broadcast(cast); - x_data += element_len; - out_data += element_len * scale; - } + SequenceExpandFunctor functor; + functor(context.template device_context(), *x, out); } }; From 26822bd774a99d19d5bb37f4890e82aacd57c391 Mon Sep 17 00:00:00 2001 From: dzhwinter Date: Tue, 20 Mar 2018 04:04:58 -0700 Subject: [PATCH 02/12] "add sequence kernel" --- paddle/fluid/operators/sequence_expand_op.cu | 107 +++++++++++++------ paddle/fluid/operators/sequence_expand_op.h | 86 ++++++++------- 2 files changed, 123 insertions(+), 70 deletions(-) diff --git a/paddle/fluid/operators/sequence_expand_op.cu b/paddle/fluid/operators/sequence_expand_op.cu index 6477af89f110a..9cdb89f8fd8c8 100644 --- a/paddle/fluid/operators/sequence_expand_op.cu +++ b/paddle/fluid/operators/sequence_expand_op.cu @@ -21,48 +21,89 @@ namespace operators { using LoDTensor = framework::LoDTensor; template -__global__ sequence_expand_kernel(const T* x_data, T* out_data, size_t* lod, - size_t element_len) { - int BLOCK_SIZE = 1024; - __shared__ T shm_lod[BLOCK_SIZE]; - for (int idx = threadIdx.x; idx < BLOCK_SIZE; ++idx) { - shm_lod[idx] = lod[idx]; +__global__ void sequence_expand_kernel(const T* x_data, T* out_data, + const size_t* lod, size_t lod_size, + size_t element_len) { + int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + for (; tid_x < static_cast(lod_size - 1); + tid_x += blockDim.x * gridDim.x) { + int scale = lod[tid_x + 1] - lod[tid_x]; + int tid_y = blockIdx.y * blockDim.y + threadIdx.y; + for (; tid_y < scale; tid_y += blockDim.y * gridDim.y) { + int tid_z = blockIdx.z * blockDim.z + threadIdx.z; + int 
item_start = tid_x / element_len; + for (; tid_z < element_len; tid_z += blockDim.z * gridDim.z) { + out_data[item_start * scale + tid_z] = x_data[item_start + tid_z]; + } + } } - for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < lod.size(); - idx += blockDim.x * gridDim.x) { - int scale = lod[i] +} + +template +__global__ void sequence_expand_grad_kernel(const T* dout_data, T* dx_data, + const size_t* lod, size_t lod_size, + size_t element_len, + size_t dout_size) { + extern __shared__ T shm[]; + int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + for (; tid_x < static_cast(lod_size - 1); + tid_x += blockDim.x * gridDim.x) { + int scale = lod[tid_x + 1] - lod[tid_x]; + int tid_y = blockIdx.y * blockDim.y + threadIdx.y; + for (; tid_y < scale; tid_y += blockDim.y * gridDim.y) { + int tid_z = blockIdx.z * blockDim.z + threadIdx.z; + int item_start = tid_x / element_len; + for (; tid_z < element_len; tid_z += blockDim.z * gridDim.z) { + shm[item_start + tid_z] += doutx_data[item_start * scale + tid_z]; + } + } + } + // synchronize before write to dx + __syncthreads(); + for (int idx = blockDimx * blockIdx.x + threadIdx.x; + idx < static_cast(dout_size); idx += blockDim.x * gridDim.x) { + dx_data[idx] = shm[idx;] } } template -void SequenceExpandFunctor::operator()( - const platform::CPUDeviceContext& context, const LoDTensor& x, - LoDTensor* out) { - x_dims = x.dims(); - size_t element_len = framework::product(x_dims) / x_dims[0]; - T* out_data = out->mutable_data(context.GetPlace()); - auto out_starts = out->lod().back(); +struct SequenceExpandFunctor { + void operator()(const platform::CUDADeviceContext& context, + const LoDTensor& x, LoDTensor* out) { + auto x_dims = x.dims(); + size_t element_len = framework::product(x_dims) / x_dims[0]; + T* out_data = out->mutable_data(context.GetPlace()); + auto out_starts = out->lod().back(); - const int kThreadsPerBlock = 1024; - int block_cols = kThreadsPerBlock; - if (out_cols < kThreadsPerBlock) { // block_cols is aligned by 32. 
- block_cols = ((out_cols + 31) >> 5) << 5; + dim3 block_size(16, 32, element_len); + dim3 grid_size(10, 10); + sequence_expand_kernel<<>>( + x.data(), out->mutable_data(context.GetPlace()), + out_starts.CUDAData(context.GetPlace()), out_starts.size(), + element_len); } - int block_rows = kThreadsPerBlock / block_cols; - dim3 block_size = dim3(block_cols, block_rows, 1); +}; - int max_threads = context.GetMaxPhysicalThreadCount(); - int max_blocks = std::max(max_threads / kThreadsPerBlock, 1); +template +struct SequenceExpandGradFunctor { + void operator()(const platform::CUDADeviceContext& ctx, const LoDTensor& x, + const LoDTensor& out, const LoDTensor& dout, LoDTensor* dx) { + auto x_dims = x.dims(); + size_t element_len = framework::product(x_dims) / x_dims[0]; + const T* x_data = x->data(); + T* out_data = out->mutable_data(context.GetPlace()); + auto out_starts = out->lod().back(); - int grid_cols = - std::min((out_cols + block_cols - 1) / block_cols, max_blocks); - int grid_rows = - std::min(max_blocks / grid_cols, std::max(out_rows / block_rows, 1)); - dim3 grid_size = dim3(grid_cols, grid_rows, 1); - sequence_expand_kernel<<>>( - x.data(), out->mutable_data(context.GetPlace()), - out_starts.CUDAData(context.GetPlace()), element_len); -} + dim3 block_size(16, 32, element_len); + dim3 grid_size(10, 10); + size_t out_size = framework::product(dx->dims()); + sequence_expand_kernel<<>>( + dout.data(), dx->mutable_data(context.GetPlace()), + out_starts.CUDAData(context.GetPlace()), out_starts.size(), element_len, + out_size); + } +}; } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/sequence_expand_op.h b/paddle/fluid/operators/sequence_expand_op.h index 12e4018b95bbf..3b66bf3d8cbdc 100644 --- a/paddle/fluid/operators/sequence_expand_op.h +++ b/paddle/fluid/operators/sequence_expand_op.h @@ -28,31 +28,36 @@ struct SequenceExpandFunctor { void operator()(const DeviceContext& ctx, const LoDTensor& x, LoDTensor* out); }; -// template -// struct SequenceExpandGradFunctor {}; +template +struct SequenceExpandGradFunctor { + void operator()(const DeviceContext& ctx, const LoDTensor& x, + const LoDTensor& out, const LoDTensor& dout, LoDTensor* dx); +}; template -void SequenceExpandFunctor::operator()( - const platform::CPUDeviceContext& context, const LoDTensor& x, - LoDTensor* out) { - x_dims = x.dims(); - size_t element_len = framework::product(x_dims) / x_dims[0]; - T* out_data = out->mutable_data(context.GetPlace()); - auto out_starts = out->lod().back(); +struct SequenceExpandFunctor { + void operator()(const platform::CPUDeviceContext& context, const LoDTensor& x, + LoDTensor* out) { + auto x_dims = x.dims(); + size_t element_len = framework::product(x_dims) / x_dims[0]; + const T* x_data = x->data(); + T* out_data = out->mutable_data(context.GetPlace()); + auto out_starts = out->lod().back(); - for (size_t i = 0; i < out_starts.size() - 1; i++) { - int scale = out_starts[i + 1] - out_starts[i]; - Eigen::TensorMap< - Eigen::Tensor> - x_t(x_data, 1, element_len); - Eigen::TensorMap> - out_t(out_data, scale, element_len); - Eigen::array cast({{scale, 1}}); - out_t.device(*context.eigen_device()) = x_t.broadcast(cast); - x_data += element_len; - out_data += element_len * scale; + for (size_t i = 0; i < out_starts.size() - 1; i++) { + int scale = out_starts[i + 1] - out_starts[i]; + Eigen::TensorMap< + Eigen::Tensor> + x_t(x_data, 1, element_len); + Eigen::TensorMap> + out_t(out_data, scale, element_len); + Eigen::array cast({{scale, 1}}); + 
out_t.device(*context.eigen_device()) = x_t.broadcast(cast); + x_data += element_len; + out_data += element_len * scale; + } } -} +}; template class SequenceExpandKernel : public framework::OpKernel { @@ -60,7 +65,6 @@ class SequenceExpandKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& context) const override { auto* x = context.Input("X"); auto* out = context.Output("Out"); - const T* x_data = x->data(); auto x_dims = x->dims(); auto* y = context.Input("Y"); PADDLE_ENFORCE(!y->lod().empty(), "y should have lod"); @@ -86,19 +90,14 @@ class SequenceExpandKernel : public framework::OpKernel { * Grad(X).lod = Input(X).lod * * */ -template -class SequenceExpandGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* d_out = context.Input(framework::GradVarName("Out")); - auto* x = context.Input("X"); - auto* out = context.Input("Out"); - auto* d_x = context.Output(framework::GradVarName("X")); - auto out_last_level = out->lod().back(); - d_x->set_lod(x->lod()); - const T* d_out_data = d_out->data(); +template +struct SequenceExpandGradFunctor { + void operator()(const platform::CPUDeviceContext& ctx, const LoDTensor& x, + const LoDTensor& out, const LoDTensor& dout, LoDTensor* dx) { + auto out_last_level = out.lod().back(); + const T* d_out_data = d_out.data(); T* d_x_data = d_x->mutable_data(context.GetPlace()); - size_t element_len = d_out->numel() / d_out->dims()[0]; + size_t element_len = d_out.numel() / d_out.dims()[0]; for (size_t i = 0; i < out_last_level.size() - 1; ++i) { size_t repeat = out_last_level[i + 1] - out_last_level[i]; Eigen::TensorMap< @@ -106,14 +105,27 @@ class SequenceExpandGradKernel : public framework::OpKernel { d_out_t(d_out_data, static_cast(repeat), element_len); Eigen::TensorMap> d_x_t(d_x_data, static_cast(element_len)); - auto place = - context.template device_context().eigen_device(); - d_x_t.device(*place) = d_out_t.sum(Eigen::array({{0}})); + d_x_t.device(*context.eigen_device()) = + d_out_t.sum(Eigen::array({{0}})); d_out_data += (repeat * element_len); d_x_data += element_len; } } }; +template +class SequenceExpandGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* d_out = context.Input(framework::GradVarName("Out")); + auto* x = context.Input("X"); + auto* out = context.Input("Out"); + auto* d_x = context.Output(framework::GradVarName("X")); + d_x->set_lod(x->lod()); + SequenceExpandGradFunctor(context.template device_context(), *x, *out, + d_out, d_x); + } +}; + } // namespace operators } // namespace paddle From e4c35d837d79c4b1a4f30e42efe143f64ec10e71 Mon Sep 17 00:00:00 2001 From: dzhwinter Date: Tue, 20 Mar 2018 04:43:00 -0700 Subject: [PATCH 03/12] "add details" --- paddle/fluid/operators/sequence_expand_op.cu | 19 +++++++++---------- paddle/fluid/operators/sequence_expand_op.h | 18 ++++++++++-------- 2 files changed, 19 insertions(+), 18 deletions(-) diff --git a/paddle/fluid/operators/sequence_expand_op.cu b/paddle/fluid/operators/sequence_expand_op.cu index 9cdb89f8fd8c8..cae0a6928455b 100644 --- a/paddle/fluid/operators/sequence_expand_op.cu +++ b/paddle/fluid/operators/sequence_expand_op.cu @@ -54,15 +54,15 @@ __global__ void sequence_expand_grad_kernel(const T* dout_data, T* dx_data, int tid_z = blockIdx.z * blockDim.z + threadIdx.z; int item_start = tid_x / element_len; for (; tid_z < element_len; tid_z += blockDim.z * gridDim.z) { - 
shm[item_start + tid_z] += doutx_data[item_start * scale + tid_z]; + shm[item_start + tid_z] += dout_data[item_start * scale + tid_z]; } } } // synchronize before write to dx __syncthreads(); - for (int idx = blockDimx * blockIdx.x + threadIdx.x; + for (int idx = blockDim.x * blockIdx.x + threadIdx.x; idx < static_cast(dout_size); idx += blockDim.x * gridDim.x) { - dx_data[idx] = shm[idx;] + dx_data[idx] = shm[idx]; } } @@ -86,19 +86,18 @@ struct SequenceExpandFunctor { template struct SequenceExpandGradFunctor { - void operator()(const platform::CUDADeviceContext& ctx, const LoDTensor& x, - const LoDTensor& out, const LoDTensor& dout, LoDTensor* dx) { + void operator()(const platform::CUDADeviceContext& context, + const LoDTensor& x, const LoDTensor& out, + const LoDTensor& dout, LoDTensor* dx) { auto x_dims = x.dims(); size_t element_len = framework::product(x_dims) / x_dims[0]; - const T* x_data = x->data(); - T* out_data = out->mutable_data(context.GetPlace()); - auto out_starts = out->lod().back(); + auto out_starts = out.lod().back(); dim3 block_size(16, 32, element_len); dim3 grid_size(10, 10); size_t out_size = framework::product(dx->dims()); - sequence_expand_kernel<<>>( + sequence_expand_grad_kernel<<>>( dout.data(), dx->mutable_data(context.GetPlace()), out_starts.CUDAData(context.GetPlace()), out_starts.size(), element_len, out_size); diff --git a/paddle/fluid/operators/sequence_expand_op.h b/paddle/fluid/operators/sequence_expand_op.h index 3b66bf3d8cbdc..11890b30ae598 100644 --- a/paddle/fluid/operators/sequence_expand_op.h +++ b/paddle/fluid/operators/sequence_expand_op.h @@ -40,7 +40,7 @@ struct SequenceExpandFunctor { LoDTensor* out) { auto x_dims = x.dims(); size_t element_len = framework::product(x_dims) / x_dims[0]; - const T* x_data = x->data(); + const T* x_data = x.data(); T* out_data = out->mutable_data(context.GetPlace()); auto out_starts = out->lod().back(); @@ -92,12 +92,12 @@ class SequenceExpandKernel : public framework::OpKernel { * */ template struct SequenceExpandGradFunctor { - void operator()(const platform::CPUDeviceContext& ctx, const LoDTensor& x, + void operator()(const platform::CPUDeviceContext& context, const LoDTensor& x, const LoDTensor& out, const LoDTensor& dout, LoDTensor* dx) { auto out_last_level = out.lod().back(); - const T* d_out_data = d_out.data(); - T* d_x_data = d_x->mutable_data(context.GetPlace()); - size_t element_len = d_out.numel() / d_out.dims()[0]; + const T* d_out_data = dout.data(); + T* d_x_data = dx->mutable_data(context.GetPlace()); + size_t element_len = dout.numel() / dout.dims()[0]; for (size_t i = 0; i < out_last_level.size() - 1; ++i) { size_t repeat = out_last_level[i + 1] - out_last_level[i]; Eigen::TensorMap< @@ -117,13 +117,15 @@ template class SequenceExpandGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* d_out = context.Input(framework::GradVarName("Out")); auto* x = context.Input("X"); auto* out = context.Input("Out"); + auto* d_out = context.Input(framework::GradVarName("Out")); + auto* d_x = context.Output(framework::GradVarName("X")); d_x->set_lod(x->lod()); - SequenceExpandGradFunctor(context.template device_context(), *x, *out, - d_out, d_x); + SequenceExpandGradFunctor functor; + functor(context.template device_context(), *x, *out, *d_out, + d_x); } }; From 53c8c36a04f92685f3fc380cbc41b9af1031de67 Mon Sep 17 00:00:00 2001 From: dzhwinter Date: Wed, 21 Mar 2018 05:49:53 -0700 Subject: [PATCH 04/12] "debug the process" --- 
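A rough numpy sketch of the expansion semantics these kernels are working
toward, close to the reference computation the unit test in
python/paddle/fluid/tests/unittests/test_sequence_expand.py converges on later
in this series (the helper name and the standalone function are illustrative
only, not part of the patch):

import numpy as np

def sequence_expand_ref(x_data, x_lod, y_lod):
    # Sequence offsets of X; a LoD-less X treats every row as its own sequence.
    x_idx = x_lod[0] if x_lod is not None else list(range(x_data.shape[0] + 1))
    # The last LoD level of Y decides how often each X sequence is repeated.
    ref = y_lod[-1]
    out = np.zeros((0, x_data.shape[1]), dtype=x_data.dtype)
    for i in range(1, len(ref)):
        repeat_num = ref[i] - ref[i - 1]
        if repeat_num > 0:
            x_sub = x_data[x_idx[i - 1]:x_idx[i], :]
            # The whole sub-sequence is repeated as a block, repeat_num times.
            out = np.vstack((out, np.tile(x_sub, (repeat_num, 1))))
    return out

For example, with x of shape [3, 1] and y_lod = [[0, 1, 4, 8]] (the first unit
test), rows 0, 1 and 2 of x are repeated 1, 3 and 4 times, giving an 8-row
output.
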
paddle/fluid/framework/executor.cc | 2 +- paddle/fluid/operators/sequence_expand_op.cu | 128 ++++++++++++------ .../paddle/fluid/tests/unittests/op_test.py | 3 + .../tests/unittests/test_sequence_expand.py | 88 ++++++------ 4 files changed, 133 insertions(+), 88 deletions(-) diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 7155d5ef2febc..5125072ddd3d5 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -44,7 +44,7 @@ struct ExecutorPrepareContext { ExecutorPrepareContext(const framework::ProgramDesc& prog, size_t block_id) : prog_(prog), block_id_(block_id) {} - const framework::ProgramDesc& prog_; + const framework::ProgramDesc prog_; size_t block_id_; std::vector> ops_; }; diff --git a/paddle/fluid/operators/sequence_expand_op.cu b/paddle/fluid/operators/sequence_expand_op.cu index cae0a6928455b..bf453ca7e8ea3 100644 --- a/paddle/fluid/operators/sequence_expand_op.cu +++ b/paddle/fluid/operators/sequence_expand_op.cu @@ -13,7 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #define EIGEN_USE_GPU +#include +#include #include "paddle/fluid/operators/sequence_expand_op.h" +#include "paddle/fluid/platform/cuda_helper.h" namespace paddle { namespace operators { @@ -22,47 +25,71 @@ using LoDTensor = framework::LoDTensor; template __global__ void sequence_expand_kernel(const T* x_data, T* out_data, - const size_t* lod, size_t lod_size, - size_t element_len) { - int tid_x = blockIdx.x * blockDim.x + threadIdx.x; - for (; tid_x < static_cast(lod_size - 1); - tid_x += blockDim.x * gridDim.x) { - int scale = lod[tid_x + 1] - lod[tid_x]; - int tid_y = blockIdx.y * blockDim.y + threadIdx.y; - for (; tid_y < scale; tid_y += blockDim.y * gridDim.y) { - int tid_z = blockIdx.z * blockDim.z + threadIdx.z; - int item_start = tid_x / element_len; - for (; tid_z < element_len; tid_z += blockDim.z * gridDim.z) { - out_data[item_start * scale + tid_z] = x_data[item_start + tid_z]; - } + const size_t* lod, + const size_t* out_offset, + size_t lod_size, size_t element_len, + size_t x_size) { + int bid_x = blockIdx.x; + if (bid_x > lod_size) return; + int repeats = lod[bid_x]; + int offset = out_offset[bid_x]; + for (int tid_y = threadIdx.y; tid_y < repeats; tid_y += blockDim.y) { + for (int tid_x = threadIdx.x; tid_x < element_len; tid_x += blockDim.x) { + out_data[(offset + tid_y) * element_len + tid_x] = + x_data[bid_x * element_len + tid_x]; } } } template __global__ void sequence_expand_grad_kernel(const T* dout_data, T* dx_data, - const size_t* lod, size_t lod_size, - size_t element_len, - size_t dout_size) { + const size_t* lod, + const size_t* out_offset, + size_t lod_size, size_t element_len, + size_t dout_size, size_t dx_size) { + // reduce visit memory time. 
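+  // Sketch of the approach below: dout is first staged into shared memory,
+  // the per-row gradients of dx are accumulated there with CudaAtomicAdd,
+  // and only after __syncthreads() is the result copied back to dx_data,
+  // instead of accumulating directly in global memory.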
+ // dout_shm = [0 - dout_size-1], dx_shm = [dout_size-1, dout_size + dx_size-1] + if (blockIdx.x == 0 && blockIdx.y == 0 && threadIdx.x == 0 && + threadIdx.y == 0) { + printf("lod_size=%ld, element_size=%ld, dout_size=%ld, dx_size=%ld\n", + lod_size, element_len, dout_size, dx_size); + } extern __shared__ T shm[]; - int tid_x = blockIdx.x * blockDim.x + threadIdx.x; - for (; tid_x < static_cast(lod_size - 1); - tid_x += blockDim.x * gridDim.x) { - int scale = lod[tid_x + 1] - lod[tid_x]; - int tid_y = blockIdx.y * blockDim.y + threadIdx.y; - for (; tid_y < scale; tid_y += blockDim.y * gridDim.y) { - int tid_z = blockIdx.z * blockDim.z + threadIdx.z; - int item_start = tid_x / element_len; - for (; tid_z < element_len; tid_z += blockDim.z * gridDim.z) { - shm[item_start + tid_z] += dout_data[item_start * scale + tid_z]; - } + T* dout_shm = shm; + T* dx_shm = &shm[dout_size]; + + // int idx = threadIdx.x + blockIdx.x * blockDim.x; + for (int idx = 0; idx < dout_size; ++idx) { + if (idx < dx_size) { + dx_shm[idx] = 0.0; + } + if (idx < dout_size) { + dout_shm[idx] = dout_data[idx]; + } + } + + int bid_x = blockIdx.x; + if (bid_x > lod_size) return; + int repeats = lod[bid_x]; + int offset = out_offset[bid_x]; + if (threadIdx.x == 0) { + printf("repeats=%d, offset=%ld\n", repeats, offset); + } + for (int tid_y = threadIdx.y; tid_y < repeats; tid_y += blockDim.y) { + for (int tid_x = threadIdx.x; tid_x < element_len; tid_x += blockDim.x) { + T val = dout_shm[(offset + tid_y) * element_len + tid_x]; + platform::CudaAtomicAdd(&dx_shm[bid_x * element_len + tid_x], val); + int dx_idx = bid_x * element_len + tid_x; + int dout_idx = (offset + tid_y) * element_len + tid_x; + printf("dx_idx=%d, dout_idx=%d, dx_data=%f, dout_data=%f, val=%f \n", + dx_idx, dout_idx, dx_shm[dx_idx], dout_shm[dout_idx], val); } } - // synchronize before write to dx __syncthreads(); - for (int idx = blockDim.x * blockIdx.x + threadIdx.x; - idx < static_cast(dout_size); idx += blockDim.x * gridDim.x) { - dx_data[idx] = shm[idx]; + // copy shared memory back to dx + for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < dx_size; + idx += blockDim.x * gridDim.x) { + dx_data[idx] = dx_shm[idx]; } } @@ -72,15 +99,20 @@ struct SequenceExpandFunctor { const LoDTensor& x, LoDTensor* out) { auto x_dims = x.dims(); size_t element_len = framework::product(x_dims) / x_dims[0]; - T* out_data = out->mutable_data(context.GetPlace()); - auto out_starts = out->lod().back(); + auto lod = out->lod().back(); + framework::Vector out_lod; + for (size_t i = 0; i < lod.size() - 1; ++i) { + out_lod.push_back(lod[i + 1] - lod[i]); + } - dim3 block_size(16, 32, element_len); - dim3 grid_size(10, 10); + int thread_x = std::max(static_cast(element_len), 32); + int block_x = static_cast(out_lod.size()); + dim3 block_size(thread_x, 1024 / thread_x); + dim3 grid_size(block_x, 1); sequence_expand_kernel<<>>( x.data(), out->mutable_data(context.GetPlace()), - out_starts.CUDAData(context.GetPlace()), out_starts.size(), - element_len); + out_lod.CUDAData(context.GetPlace()), lod.CUDAData(context.GetPlace()), + out_lod.size(), element_len, framework::product(x_dims)); } }; @@ -91,16 +123,24 @@ struct SequenceExpandGradFunctor { const LoDTensor& dout, LoDTensor* dx) { auto x_dims = x.dims(); size_t element_len = framework::product(x_dims) / x_dims[0]; - auto out_starts = out.lod().back(); + auto lod = out.lod().back(); + framework::Vector out_lod; + for (size_t i = 0; i < lod.size() - 1; ++i) { + out_lod.push_back(lod[i + 1] - lod[i]); + } + size_t 
dout_size = framework::product(dout.dims()); + size_t dx_size = framework::product(dx->dims()); - dim3 block_size(16, 32, element_len); - dim3 grid_size(10, 10); - size_t out_size = framework::product(dx->dims()); - sequence_expand_grad_kernel<<(element_len), 32); + dim3 block_size(thread_x, 1024 / thread_x); + int block_x = static_cast(out_lod.size()); + dim3 grid_size(block_x, 1); + sequence_expand_grad_kernel<<>>( dout.data(), dx->mutable_data(context.GetPlace()), - out_starts.CUDAData(context.GetPlace()), out_starts.size(), element_len, - out_size); + out_lod.CUDAData(context.GetPlace()), lod.CUDAData(context.GetPlace()), + out_lod.size(), element_len, dout_size, dx_size); } }; diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index 8393f7827b1c7..555f188abb9b8 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -362,6 +362,9 @@ def __assert_is_close(self, numeric_grads, analytic_grads, names, for a, b, name in itertools.izip(numeric_grads, analytic_grads, names): abs_a = np.abs(a) abs_a[abs_a < 1e-3] = 1 + print("actual", a) + print("*****") + print("expected", b) diff_mat = np.abs(a - b) / abs_a max_diff = np.max(diff_mat) diff --git a/python/paddle/fluid/tests/unittests/test_sequence_expand.py b/python/paddle/fluid/tests/unittests/test_sequence_expand.py index 957fa5d2c4a79..f984127b4d64f 100644 --- a/python/paddle/fluid/tests/unittests/test_sequence_expand.py +++ b/python/paddle/fluid/tests/unittests/test_sequence_expand.py @@ -19,8 +19,14 @@ class TestSequenceExpand(OpTest): def set_data(self): - x_data = np.random.uniform(0.1, 1, [3, 1]).astype('float32') - y_data = np.random.uniform(0.1, 1, [8, 1]).astype('float32') + x = [i / 10.0 for i in range(3)] + y = [i / 10.0 for i in range(8)] + x_data = np.array(x).reshape(3, 1).astype('float32') + y_data = np.array(y).reshape(8, 1).astype('float32') + print(x_data) + print(y_data) + # x_data = np.random.uniform(0.1, 1, [3, 1]).astype('float32') + # y_data = np.random.uniform(0.1, 1, [8, 1]).astype('float32') y_lod = [[0, 1, 4, 8]] self.inputs = {'X': x_data, 'Y': (y_data, y_lod)} @@ -45,47 +51,43 @@ def test_check_output(self): def test_check_grad(self): self.check_grad(["X"], "Out") - -class TestSequenceExpandCase1(TestSequenceExpand): - def set_data(self): - x_data = np.random.uniform(0.1, 1, [5, 1]).astype('float32') - x_lod = [[0, 2, 5]] - y_data = np.random.uniform(0.1, 1, [13, 1]).astype('float32') - y_lod = [[0, 2, 5], [0, 2, 4, 7, 10, 13]] - self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)} - - -class TestSequenceExpandCase2(TestSequenceExpand): - def set_data(self): - x_data = np.random.uniform(0.1, 1, [1, 2, 2]).astype('float32') - x_lod = [[0, 1]] - y_data = np.random.uniform(0.1, 1, [2, 2, 2]).astype('float32') - y_lod = [[0, 2]] - self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)} - - -class TestSequenceExpandCase3(TestSequenceExpand): - def set_data(self): - x_data = np.random.uniform(0.1, 1, [4, 1]).astype('float32') - x_lod = [[0, 1, 2, 3, 4]] - y_data = np.random.uniform(0.1, 1, [6, 1]).astype('float32') - y_lod = [[0, 2, 4, 4, 6]] - self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)} - - -class TestSequenceExpandCase4(TestSequenceExpand): - def set_data(self): - x_data = np.array( - [0.1, 0.3, 0.2, 0.15, 0.25, 0.2, 0.15, 0.25, 0.1, 0.3]).reshape( - [2, 5]).astype('float32') - x_lod = [[ - 0, - 1, - 2, - ]] - y_data = np.random.uniform(0.1, 1, [2, 1]).astype('float32') - 
y_lod = [[0, 1, 2], [0, 1, 2]] - self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)} + # class TestSequenceExpandCase1(TestSequenceExpand): + # def set_data(self): + # x_data = np.random.uniform(0.1, 1, [5, 1]).astype('float32') + # x_lod = [[0, 2, 5]] + # y_data = np.random.uniform(0.1, 1, [13, 1]).astype('float32') + # y_lod = [[0, 2, 5], [0, 2, 4, 7, 10, 13]] + # self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)} + + # class TestSequenceExpandCase2(TestSequenceExpand): + # def set_data(self): + # x_data = np.random.uniform(0.1, 1, [1, 2, 2]).astype('float32') + # x_lod = [[0, 1]] + # y_data = np.random.uniform(0.1, 1, [2, 2, 2]).astype('float32') + # y_lod = [[0, 2]] + # self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)} + + # class TestSequenceExpandCase3(TestSequenceExpand): + # def set_data(self): + # x_data = np.random.uniform(0.1, 1, [4, 1]).astype('float32') + # x_lod = [[0, 1, 2, 3, 4]] + # y_data = np.random.uniform(0.1, 1, [6, 1]).astype('float32') + # y_lod = [[0, 2, 4, 4, 6]] + # self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)} + + # class TestSequenceExpandCase4(TestSequenceExpand): + # def set_data(self): + # x_data = np.array( + # [0.1, 0.3, 0.2, 0.15, 0.25, 0.2, 0.15, 0.25, 0.1, 0.3]).reshape( + # [2, 5]).astype('float32') + # x_lod = [[ + # 0, + # 1, + # 2, + # ]] + # y_data = np.random.uniform(0.1, 1, [2, 1]).astype('float32') + # y_lod = [[0, 1, 2], [0, 1, 2]] + # self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)} if __name__ == '__main__': From db1b128feb63a14514c2e38e344f6b464e1b7a68 Mon Sep 17 00:00:00 2001 From: dzhwinter Date: Mon, 26 Mar 2018 20:16:57 -0700 Subject: [PATCH 05/12] "add details" --- paddle/fluid/operators/sequence_expand_op.h | 161 ++++++++++++++------ 1 file changed, 114 insertions(+), 47 deletions(-) diff --git a/paddle/fluid/operators/sequence_expand_op.h b/paddle/fluid/operators/sequence_expand_op.h index 11890b30ae598..5cab367988dd2 100644 --- a/paddle/fluid/operators/sequence_expand_op.h +++ b/paddle/fluid/operators/sequence_expand_op.h @@ -13,15 +13,19 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once +#include // std::itoa #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/memory/memcpy.h" -#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/operators/math/math_function.h" namespace paddle { namespace operators { using LoDTensor = framework::LoDTensor; +template +using EigenMatrix = framework::EigenMatrix; template struct SequenceExpandFunctor { @@ -38,23 +42,35 @@ template struct SequenceExpandFunctor { void operator()(const platform::CPUDeviceContext& context, const LoDTensor& x, LoDTensor* out) { - auto x_dims = x.dims(); - size_t element_len = framework::product(x_dims) / x_dims[0]; - const T* x_data = x.data(); - T* out_data = out->mutable_data(context.GetPlace()); - auto out_starts = out->lod().back(); - - for (size_t i = 0; i < out_starts.size() - 1; i++) { - int scale = out_starts[i + 1] - out_starts[i]; - Eigen::TensorMap< - Eigen::Tensor> - x_t(x_data, 1, element_len); - Eigen::TensorMap> - out_t(out_data, scale, element_len); - Eigen::array cast({{scale, 1}}); - out_t.device(*context.eigen_device()) = x_t.broadcast(cast); - x_data += element_len; - out_data += element_len * scale; + auto& out_lod = out->lod()[0]; + framework::Vector x_lod; + if (x.lod() == 1) { + x_lod = x.lod()[0]; + } else { + x_lod.reserve(out_lod.size()); + std::itoa(x_lod.begin(), x_lod.end(), 0); // fill 0 ~ out_lod.size()-1 + } + int out_offset = 0; + auto& eigen_place = *context.eigen_device(); + for (size_t i = 1; i < out_lod.size(); ++i) { + int repeat_num = y_lod[ref_level][i] - y_lod[ref_level][i - 1]; + int x_start = x_lod[i - 1]; + int x_end = x_lod[i]; + int x_seq_len = x_end - x_start; + if (repeat_num > 0) { + auto x_sub_tensor = x->Slice(x_start, x_end); + x_sub_tensor.Resize({1, x_sub_tensor.numel()}); + int out_start = out_offset; + if (x_lod.size() == 1) { + out_start = out_lod[0][out_offset]; + } + auto out_sub_tensor = + out->Slice(out_start, out_start + x_seq_len * repeat_num); + out_sub_tensor.Resize({repeat_num, x_sub_tensor.dims()[1]}); + EigenMatrix::From(out_sub_tensor).device(eigen_place) = + EigenMatrix::From(x_sub_tensor) + .broadcast(Eigen::array({{repeat_num, 1}})); + } } } }; @@ -64,15 +80,42 @@ class SequenceExpandKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { auto* x = context.Input("X"); - auto* out = context.Output("Out"); - auto x_dims = x->dims(); auto* y = context.Input("Y"); - PADDLE_ENFORCE(!y->lod().empty(), "y should have lod"); - PADDLE_ENFORCE_EQ(static_cast(x_dims[0]), - y->lod().back().size() - 1, - "The size of last lod level in Input(Y)" - "must be equal to dims[0] of Input(X)."); - out->set_lod(y->lod()); + auto* out = context.Output("Out"); + + int ref_level = context.Attr("ref_level"); + auto& x_lod = x->lod(); + auto& y_lod = y->lod(); + + if (ref_level == -1) ref_level = y_lod.size() - 1; + + out->mutable_data(context.GetPlace()); + + if (y_lod[ref_level].size() <= 1) { + framework::TensorCopy(*x, context.GetPlace(), out); + return; + } + + auto& out_lod = *out->mutable_lod(); + // x lod level is at most 1. 
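+    // Worked example with assumed values (not taken from this patch's tests):
+    // if x_lod[0] = {0, 2, 5} and y_lod[ref_level] = {0, 3, 5}, sequence 0 of
+    // X (length 2) is repeated 3 times and sequence 1 (length 3) twice, so
+    // the x_lod.size() == 1 branch below builds
+    // out_lod[0] = {0, 2, 4, 6, 9, 12}.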
+ if (x_lod.size() == 0) { + out_lod = y_lod[ref_level]; + } else if (x_lod.size() == 1) { + out_lod.resize(1); + out_lod[0] = {0}; + int out_offset = 0; + for (size_t i = 1; i < y_lod[ref_level].size(); ++i) { + int repeat_num = y_lod[ref_level][i] - y_lod[ref_level][i - 1]; + int x_start = x_lod[0][i - 1]; + int x_end = x_lod[0][i]; + int x_seq_len = x_end - x_start; + for (int j = 0; j < repeat_num; ++j) { + out_lod[0].push_back(out_lod[0].back() + x_seq_len); + out_offset++; + } + } + } + SequenceExpandFunctor functor; functor(context.template device_context(), *x, out); } @@ -94,21 +137,31 @@ template struct SequenceExpandGradFunctor { void operator()(const platform::CPUDeviceContext& context, const LoDTensor& x, const LoDTensor& out, const LoDTensor& dout, LoDTensor* dx) { - auto out_last_level = out.lod().back(); - const T* d_out_data = dout.data(); - T* d_x_data = dx->mutable_data(context.GetPlace()); - size_t element_len = dout.numel() / dout.dims()[0]; - for (size_t i = 0; i < out_last_level.size() - 1; ++i) { - size_t repeat = out_last_level[i + 1] - out_last_level[i]; - Eigen::TensorMap< - Eigen::Tensor> - d_out_t(d_out_data, static_cast(repeat), element_len); - Eigen::TensorMap> - d_x_t(d_x_data, static_cast(element_len)); - d_x_t.device(*context.eigen_device()) = - d_out_t.sum(Eigen::array({{0}})); - d_out_data += (repeat * element_len); - d_x_data += element_len; + auto& dev_ctx = context.template device_context(); + + math::SetConstant set_zero; + set_zero(dev_ctx, g_x, static_cast(0)); + + int g_out_offset = 0; + for (size_t i = 1; i < y_lod[ref_level].size(); ++i) { + int repeat_num = y_lod[ref_level][i] - y_lod[ref_level][i - 1]; + if (repeat_num > 0) { + int x_start = i - 1; + int x_end = i; + if (x_lod.size() == 1) { + x_start = x_lod[0][i - 1]; + x_end = x_lod[0][i]; + } + int x_seq_len = x_end - x_start; + auto g_x_sub = g_x->Slice(x_start, x_end); + g_x_sub.Resize(flatten_to_1d(g_x_sub.dims())); + int g_out_end = g_out_offset + repeat_num * x_seq_len; + auto g_out_sub = g_out->Slice(g_out_offset, g_out_end); + g_out_sub.Resize({repeat_num, g_x_sub.dims()[0]}); + math::ColwiseSum col_sum; + col_sum(dev_ctx, g_out_sub, &g_x_sub); + g_out_offset += repeat_num * x_seq_len; + } } } }; @@ -117,15 +170,29 @@ template class SequenceExpandGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { + auto* g_out = context.Input(framework::GradVarName("Out")); auto* x = context.Input("X"); - auto* out = context.Input("Out"); - auto* d_out = context.Input(framework::GradVarName("Out")); + auto* y = context.Input("Y"); + auto* g_x = context.Output(framework::GradVarName("X")); + int ref_level = context.Attr("ref_level"); + + g_x->mutable_data(context.GetPlace()); + g_x->set_lod(x->lod()); + + auto& x_lod = x->lod(); + auto& y_lod = y->lod(); + + if (ref_level == -1) ref_level = y_lod.size() - 1; + + // just copy the gradient + if (y_lod[ref_level].size() <= 1) { + framework::TensorCopy(*g_out, context.GetPlace(), g_x); + return; + } - auto* d_x = context.Output(framework::GradVarName("X")); - d_x->set_lod(x->lod()); SequenceExpandGradFunctor functor; - functor(context.template device_context(), *x, *out, *d_out, - d_x); + functor(context.template device_context(), *x, *y, *g_out, + g_x); } }; From 0be1e09f2c703c1479259ab68b06cc4bd1cb5c43 Mon Sep 17 00:00:00 2001 From: dzhwinter Date: Wed, 28 Mar 2018 02:34:34 -0700 Subject: [PATCH 06/12] "fix ci" --- paddle/fluid/operators/sequence_expand_op.cc | 5 +- 
paddle/fluid/operators/sequence_expand_op.cu | 193 +++++++++--------- paddle/fluid/operators/sequence_expand_op.h | 130 ++++++------ .../tests/unittests/test_sequence_expand.py | 22 +- 4 files changed, 183 insertions(+), 167 deletions(-) diff --git a/paddle/fluid/operators/sequence_expand_op.cc b/paddle/fluid/operators/sequence_expand_op.cc index 786fe63e7580c..ae52849162ae4 100644 --- a/paddle/fluid/operators/sequence_expand_op.cc +++ b/paddle/fluid/operators/sequence_expand_op.cc @@ -84,12 +84,11 @@ class SequenceExpandOp : public framework::OperatorWithKernel { } } out_dims[0] = out_first_dim; - ctx->SetOutputDim("Out", out_dims); } else { out_dims[0] = -1; - ctx->SetOutputDim("Out", out_dims); - ctx->ShareLoD("X", /*->*/ "Out"); } + ctx->SetOutputDim("Out", out_dims); + ctx->ShareLoD("X", /*->*/ "Out"); } }; diff --git a/paddle/fluid/operators/sequence_expand_op.cu b/paddle/fluid/operators/sequence_expand_op.cu index 743e3bbc297c5..1bd73426522bc 100644 --- a/paddle/fluid/operators/sequence_expand_op.cu +++ b/paddle/fluid/operators/sequence_expand_op.cu @@ -24,123 +24,128 @@ namespace operators { using LoDTensor = framework::LoDTensor; template -__global__ void sequence_expand_kernel(const T* x_data, T* out_data, - const size_t* lod, - const size_t* out_offset, - size_t lod_size, size_t element_len, - size_t x_size) { - int bid_x = blockIdx.x; - if (bid_x > lod_size) return; - int repeats = lod[bid_x]; - int offset = out_offset[bid_x]; - for (int tid_y = threadIdx.y; tid_y < repeats; tid_y += blockDim.y) { - for (int tid_x = threadIdx.x; tid_x < element_len; tid_x += blockDim.x) { - out_data[(offset + tid_y) * element_len + tid_x] = - x_data[bid_x * element_len + tid_x]; +__global__ void sequence_expand_kernel(const T* x_data, const size_t* x_lod, + const size_t* ref_lod, + const size_t lod_size, + /* default=1, + the instance length*/ + const int x_item_length, T* out_data) { + constexpr int N = 1024; + __shared__ int mem[N]; + int offset = 0; + for (int i = 0; i < lod_size; ++i) { + mem[i] = offset; + if (i < lod_size - 1) { + offset += (ref_lod[i + 1] - ref_lod[i]) * (x_lod[i + 1] - x_lod[i]); } } -} + __syncthreads(); -template -__global__ void sequence_expand_grad_kernel(const T* dout_data, T* dx_data, - const size_t* lod, - const size_t* out_offset, - size_t lod_size, size_t element_len, - size_t dout_size, size_t dx_size) { - // reduce visit memory time. 
- // dout_shm = [0 - dout_size-1], dx_shm = [dout_size-1, dout_size + dx_size-1] - if (blockIdx.x == 0 && blockIdx.y == 0 && threadIdx.x == 0 && - threadIdx.y == 0) { - printf("lod_size=%ld, element_size=%ld, dout_size=%ld, dx_size=%ld\n", - lod_size, element_len, dout_size, dx_size); - } - extern __shared__ T shm[]; - T* dout_shm = shm; - T* dx_shm = &shm[dout_size]; - - // int idx = threadIdx.x + blockIdx.x * blockDim.x; - for (int idx = 0; idx < dout_size; ++idx) { - if (idx < dx_size) { - dx_shm[idx] = 0.0; - } - if (idx < dout_size) { - dout_shm[idx] = dout_data[idx]; + int bid = blockIdx.x; + if (bid >= lod_size - 1) return; + + int x_item_count = x_lod[bid + 1] - x_lod[bid]; + int repeats = ref_lod[bid + 1] - ref_lod[bid]; + int out_offset = mem[bid]; + int x_offset = x_lod[bid]; + for (int tid_z = threadIdx.z; tid_z < repeats; tid_z += blockDim.z) { + for (int tid_y = threadIdx.y; tid_y < x_item_count; tid_y += blockDim.y) { + for (int tid_x = threadIdx.x; tid_x < x_item_length; + tid_x += blockDim.x) { + out_data[(out_offset + tid_z * x_item_count + tid_y) * x_item_length + + tid_x] = x_data[(x_offset + tid_y) * x_item_length + tid_x]; + } } } +} - int bid_x = blockIdx.x; - if (bid_x > lod_size) return; - int repeats = lod[bid_x]; - int offset = out_offset[bid_x]; - if (threadIdx.x == 0) { - printf("repeats=%d, offset=%ld\n", repeats, offset); - } - for (int tid_y = threadIdx.y; tid_y < repeats; tid_y += blockDim.y) { - for (int tid_x = threadIdx.x; tid_x < element_len; tid_x += blockDim.x) { - T val = dout_shm[(offset + tid_y) * element_len + tid_x]; - platform::CudaAtomicAdd(&dx_shm[bid_x * element_len + tid_x], val); - int dx_idx = bid_x * element_len + tid_x; - int dout_idx = (offset + tid_y) * element_len + tid_x; - printf("dx_idx=%d, dout_idx=%d, dx_data=%f, dout_data=%f, val=%f \n", - dx_idx, dout_idx, dx_shm[dx_idx], dout_shm[dout_idx], val); +template +__global__ void sequence_expand_grad_kernel(const T* dout_data, + const size_t* ref_lod, + const size_t* dx_lod, + const size_t lod_size, + /* default=1, + the instance length*/ + const int x_item_length, + T* dx_data) { + // TODO(dzhwinter) : too many atomicAdd + // use shared memory to reduce memory visits + constexpr int N = 1024; + __shared__ int mem[N]; + int offset = 0; + for (int i = 0; i < lod_size; ++i) { + mem[i] = offset; + if (i < lod_size - 1) { + offset += (ref_lod[i + 1] - ref_lod[i]) * (dx_lod[i + 1] - dx_lod[i]); } } __syncthreads(); - // copy shared memory back to dx - for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < dx_size; - idx += blockDim.x * gridDim.x) { - dx_data[idx] = dx_shm[idx]; + + int bid = blockIdx.x; + if (bid >= lod_size - 1) return; + int x_item_count = dx_lod[bid + 1] - dx_lod[bid]; + int repeats = ref_lod[bid + 1] - ref_lod[bid]; + int out_offset = mem[bid]; + int x_offset = dx_lod[bid]; + + for (int tid_z = threadIdx.z; tid_z < repeats; tid_z += blockDim.z) { + for (int tid_y = threadIdx.y; tid_y < x_item_count; tid_y += blockDim.y) { + for (int tid_x = threadIdx.x; tid_x < x_item_length; + tid_x += blockDim.x) { + platform::CudaAtomicAdd( + &dx_data[(x_offset + tid_y) * x_item_length + tid_x], + dout_data[(out_offset + tid_z * x_item_count + tid_y) * + x_item_length + + tid_x]); + } + } } } template struct SequenceExpandFunctor { - void operator()(const platform::CUDADeviceContext& context, - const LoDTensor& x, LoDTensor* out) { - auto x_dims = x.dims(); - size_t element_len = framework::product(x_dims) / x_dims[0]; - auto lod = out->lod().back(); - framework::Vector 
out_lod; - for (size_t i = 0; i < lod.size() - 1; ++i) { - out_lod.push_back(lod[i + 1] - lod[i]); - } - - int thread_x = std::max(static_cast(element_len), 32); - int block_x = static_cast(out_lod.size()); - dim3 block_size(thread_x, 1024 / thread_x); + void operator()( + const platform::CUDADeviceContext& context, const LoDTensor& x, + const framework::Vector& x_lod, /*expand source lod*/ + const framework::Vector& ref_lod, /*expand referenced lod*/ + LoDTensor* out) { + int x_item_length = 1; + x_item_length = x.numel() / x.dims()[0]; + VLOG(0) << "x_item_length" << x_item_length; + int thread_x = std::max(static_cast(ref_lod.size()), 32); + int thread_y = std::max(1024 / thread_x, 16); + int thread_z = std::min(1024 / thread_x / thread_y, 16); + int block_x = static_cast(ref_lod.size()); + dim3 block_size(thread_x, thread_y, thread_z); dim3 grid_size(block_x, 1); + sequence_expand_kernel<<>>( - x.data(), out->mutable_data(context.GetPlace()), - out_lod.CUDAData(context.GetPlace()), lod.CUDAData(context.GetPlace()), - out_lod.size(), element_len, framework::product(x_dims)); + x.data(), x_lod.CUDAData(context.GetPlace()), + ref_lod.CUDAData(context.GetPlace()), x_lod.size(), x_item_length, + out->mutable_data(context.GetPlace())); } }; template struct SequenceExpandGradFunctor { void operator()(const platform::CUDADeviceContext& context, - const LoDTensor& x, const LoDTensor& out, - const LoDTensor& dout, LoDTensor* dx) { - auto x_dims = x.dims(); - size_t element_len = framework::product(x_dims) / x_dims[0]; - auto lod = out.lod().back(); - framework::Vector out_lod; - for (size_t i = 0; i < lod.size() - 1; ++i) { - out_lod.push_back(lod[i + 1] - lod[i]); - } - size_t dout_size = framework::product(dout.dims()); - size_t dx_size = framework::product(dx->dims()); - - int thread_x = std::max(static_cast(element_len), 32); - dim3 block_size(thread_x, 1024 / thread_x); - int block_x = static_cast(out_lod.size()); + const LoDTensor& dout, + const framework::Vector& x_lod, /*expand source lod*/ + const framework::Vector& ref_lod, /*expand based lod*/ + LoDTensor* dx) { + int x_item_length = 1; + x_item_length = framework::product(dx->dims()) / dx->dims()[0]; + + int thread_x = std::max(static_cast(ref_lod.size()), 32); + int thread_y = std::max(1024 / thread_x, 16); + int thread_z = std::min(1024 / thread_x / thread_y, 16); + int block_x = static_cast(ref_lod.size()); + dim3 block_size(thread_x, thread_y, thread_z); dim3 grid_size(block_x, 1); - sequence_expand_grad_kernel<<>>( - dout.data(), dx->mutable_data(context.GetPlace()), - out_lod.CUDAData(context.GetPlace()), lod.CUDAData(context.GetPlace()), - out_lod.size(), element_len, dout_size, dx_size); + sequence_expand_grad_kernel<<>>( + dout.data(), ref_lod.CUDAData(context.GetPlace()), + x_lod.CUDAData(context.GetPlace()), ref_lod.size(), x_item_length, + dx->mutable_data(context.GetPlace())); } }; diff --git a/paddle/fluid/operators/sequence_expand_op.h b/paddle/fluid/operators/sequence_expand_op.h index 5cab367988dd2..c55c3e215abdf 100644 --- a/paddle/fluid/operators/sequence_expand_op.h +++ b/paddle/fluid/operators/sequence_expand_op.h @@ -13,8 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once -#include // std::itoa +#include // std::iota +#include +#include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/operators/math/math_function.h" @@ -29,40 +31,42 @@ using EigenMatrix = framework::EigenMatrix; template struct SequenceExpandFunctor { - void operator()(const DeviceContext& ctx, const LoDTensor& x, LoDTensor* out); + void operator()( + const DeviceContext& ctx, const LoDTensor& x, + const framework::Vector& x_lod, /*expand source lod*/ + const framework::Vector& ref_lod, /*expand referenced lod*/ + LoDTensor* out); }; template struct SequenceExpandGradFunctor { - void operator()(const DeviceContext& ctx, const LoDTensor& x, - const LoDTensor& out, const LoDTensor& dout, LoDTensor* dx); + void operator()( + const DeviceContext& ctx, const LoDTensor& dout, + const framework::Vector& x_lod, /*expand source lod*/ + const framework::Vector& ref_lod, /*expand referenced lod*/ + LoDTensor* dx); }; template struct SequenceExpandFunctor { - void operator()(const platform::CPUDeviceContext& context, const LoDTensor& x, - LoDTensor* out) { - auto& out_lod = out->lod()[0]; - framework::Vector x_lod; - if (x.lod() == 1) { - x_lod = x.lod()[0]; - } else { - x_lod.reserve(out_lod.size()); - std::itoa(x_lod.begin(), x_lod.end(), 0); // fill 0 ~ out_lod.size()-1 - } + void operator()( + const platform::CPUDeviceContext& context, const LoDTensor& x, + const framework::Vector& x_lod, /*expand source lod*/ + const framework::Vector& ref_lod, /*expand referenced lod*/ + LoDTensor* out) { int out_offset = 0; auto& eigen_place = *context.eigen_device(); - for (size_t i = 1; i < out_lod.size(); ++i) { - int repeat_num = y_lod[ref_level][i] - y_lod[ref_level][i - 1]; + for (size_t i = 1; i < ref_lod.size(); ++i) { + int repeat_num = ref_lod[i] - ref_lod[i - 1]; int x_start = x_lod[i - 1]; int x_end = x_lod[i]; int x_seq_len = x_end - x_start; if (repeat_num > 0) { - auto x_sub_tensor = x->Slice(x_start, x_end); + auto x_sub_tensor = x.Slice(x_start, x_end); x_sub_tensor.Resize({1, x_sub_tensor.numel()}); int out_start = out_offset; - if (x_lod.size() == 1) { - out_start = out_lod[0][out_offset]; + if (out->lod().size() == 1) { + out_start = out->lod()[0][out_offset]; } auto out_sub_tensor = out->Slice(out_start, out_start + x_seq_len * repeat_num); @@ -71,6 +75,7 @@ struct SequenceExpandFunctor { EigenMatrix::From(x_sub_tensor) .broadcast(Eigen::array({{repeat_num, 1}})); } + out_offset += repeat_num; } } }; @@ -96,13 +101,10 @@ class SequenceExpandKernel : public framework::OpKernel { return; } - auto& out_lod = *out->mutable_lod(); // x lod level is at most 1. 
- if (x_lod.size() == 0) { - out_lod = y_lod[ref_level]; - } else if (x_lod.size() == 1) { - out_lod.resize(1); - out_lod[0] = {0}; + framework::Vector out_lod; + if (x_lod.size() == 1) { + out_lod.push_back(0); int out_offset = 0; for (size_t i = 1; i < y_lod[ref_level].size(); ++i) { int repeat_num = y_lod[ref_level][i] - y_lod[ref_level][i - 1]; @@ -110,14 +112,25 @@ class SequenceExpandKernel : public framework::OpKernel { int x_end = x_lod[0][i]; int x_seq_len = x_end - x_start; for (int j = 0; j < repeat_num; ++j) { - out_lod[0].push_back(out_lod[0].back() + x_seq_len); + out_lod.push_back(out_lod.back() + x_seq_len); out_offset++; } } + // write lod to out if x has lod + auto& ref_lod = *out->mutable_lod(); + ref_lod[0] = out_lod; + } + framework::Vector ref_x_lod; + if (x->lod().size() == 1) { + ref_x_lod = x->lod()[0]; + } else { + // x_lod doesn't has lod, use fake x lod, level = 0 + ref_x_lod.resize(x->dims()[0] + 1); + std::iota(ref_x_lod.begin(), ref_x_lod.end(), 0); } - SequenceExpandFunctor functor; - functor(context.template device_context(), *x, out); + functor(context.template device_context(), *x, ref_x_lod, + y_lod[ref_level], out); } }; @@ -135,32 +148,29 @@ class SequenceExpandKernel : public framework::OpKernel { * */ template struct SequenceExpandGradFunctor { - void operator()(const platform::CPUDeviceContext& context, const LoDTensor& x, - const LoDTensor& out, const LoDTensor& dout, LoDTensor* dx) { - auto& dev_ctx = context.template device_context(); - - math::SetConstant set_zero; - set_zero(dev_ctx, g_x, static_cast(0)); - - int g_out_offset = 0; - for (size_t i = 1; i < y_lod[ref_level].size(); ++i) { - int repeat_num = y_lod[ref_level][i] - y_lod[ref_level][i - 1]; + void operator()( + const platform::CPUDeviceContext& context, const LoDTensor& dout, + const framework::Vector& x_lod, /*expand source lod*/ + const framework::Vector& ref_lod, /*expand referenced lod*/ + LoDTensor* dx) { + math::SetConstant set_zero; + set_zero(context, dx, static_cast(0)); + + int dout_offset = 0; + for (size_t i = 1; i < ref_lod.size(); ++i) { + int repeat_num = ref_lod[i] - ref_lod[i - 1]; if (repeat_num > 0) { - int x_start = i - 1; - int x_end = i; - if (x_lod.size() == 1) { - x_start = x_lod[0][i - 1]; - x_end = x_lod[0][i]; - } + int x_start = x_lod[i - 1]; + int x_end = x_lod[i]; int x_seq_len = x_end - x_start; - auto g_x_sub = g_x->Slice(x_start, x_end); - g_x_sub.Resize(flatten_to_1d(g_x_sub.dims())); - int g_out_end = g_out_offset + repeat_num * x_seq_len; - auto g_out_sub = g_out->Slice(g_out_offset, g_out_end); - g_out_sub.Resize({repeat_num, g_x_sub.dims()[0]}); - math::ColwiseSum col_sum; - col_sum(dev_ctx, g_out_sub, &g_x_sub); - g_out_offset += repeat_num * x_seq_len; + auto dx_sub = dx->Slice(x_start, x_end); + dx_sub.Resize(flatten_to_1d(dx_sub.dims())); + int dout_end = dout_offset + repeat_num * x_seq_len; + auto dout_sub = dout.Slice(dout_offset, dout_end); + dout_sub.Resize({repeat_num, dx_sub.dims()[0]}); + math::ColwiseSum col_sum; + col_sum(context, dout_sub, &dx_sub); + dout_offset += repeat_num * x_seq_len; } } } @@ -179,20 +189,26 @@ class SequenceExpandGradKernel : public framework::OpKernel { g_x->mutable_data(context.GetPlace()); g_x->set_lod(x->lod()); - auto& x_lod = x->lod(); auto& y_lod = y->lod(); - if (ref_level == -1) ref_level = y_lod.size() - 1; - // just copy the gradient if (y_lod[ref_level].size() <= 1) { framework::TensorCopy(*g_out, context.GetPlace(), g_x); return; } + framework::Vector ref_x_lod; + framework::Vector ref_lod = 
y_lod[ref_level]; + if (x->lod().size() == 1) { + ref_x_lod = x->lod()[0]; + } else { + // x_lod doesn't has lod, use fake x lod, level = 0 + ref_x_lod.resize(x->dims()[0] + 1); + std::iota(ref_x_lod.begin(), ref_x_lod.end(), 0); + } SequenceExpandGradFunctor functor; - functor(context.template device_context(), *x, *y, *g_out, - g_x); + functor(context.template device_context(), *g_out, ref_x_lod, + ref_lod, g_x); } }; diff --git a/python/paddle/fluid/tests/unittests/test_sequence_expand.py b/python/paddle/fluid/tests/unittests/test_sequence_expand.py index d1cebc4ea2df9..4c8ec1426c6e1 100644 --- a/python/paddle/fluid/tests/unittests/test_sequence_expand.py +++ b/python/paddle/fluid/tests/unittests/test_sequence_expand.py @@ -19,14 +19,8 @@ class TestSequenceExpand(OpTest): def set_data(self): - x = [i / 10.0 for i in range(3)] - y = [i / 10.0 for i in range(8)] - x_data = np.array(x).reshape(3, 1).astype('float32') - y_data = np.array(y).reshape(8, 1).astype('float32') - print(x_data) - print(y_data) - # x_data = np.random.uniform(0.1, 1, [3, 1]).astype('float32') - # y_data = np.random.uniform(0.1, 1, [8, 1]).astype('float32') + x_data = np.random.uniform(0.1, 1, [3, 1]).astype('float32') + y_data = np.random.uniform(0.1, 1, [8, 1]).astype('float32') y_lod = [[0, 1, 4, 8]] self.inputs = {'X': x_data, 'Y': (y_data, y_lod)} @@ -53,8 +47,10 @@ def compute(self): x_len = x_idx[i] - x_idx[i - 1] if repeat_num > 0: x_sub = x_data[x_idx[i - 1]:x_idx[i], :] - x_sub = np.repeat(x_sub, repeat_num, axis=0) - out = np.vstack((out, x_sub)) + stacked_x_sub = x_sub + for r in range(repeat_num - 1): + stacked_x_sub = np.vstack((stacked_x_sub, x_sub)) + out = np.vstack((out, stacked_x_sub)) if x_lod is not None: for j in xrange(repeat_num): out_lod[0].append(out_lod[0][-1] + x_len) @@ -107,11 +103,11 @@ def set_data(self): class TestSequenceExpandCase4(TestSequenceExpand): def set_data(self): - data = [0.1, 0.3, 0.2, 0.15, 0.25, 0.2, 0.15, 0.25, 0.1, 0.3] + data = np.random.uniform(0.1, 1, [5 * 2, 1]) x_data = np.array(data).reshape([5, 2]).astype('float32') x_lod = [[0, 2, 5]] - y_data = np.random.uniform(0.1, 1, [2, 1]).astype('float32') - y_lod = [[0, 1, 2], [0, 1, 2]] + y_data = np.random.uniform(0.1, 1, [3, 1]).astype('float32') + y_lod = [[0, 1, 3], [0, 1, 3]] self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)} From 0412f5e09b9c1d13593a28b6a529affb26681141 Mon Sep 17 00:00:00 2001 From: dzhwinter Date: Wed, 28 Mar 2018 05:21:29 -0700 Subject: [PATCH 07/12] "fix ci" --- paddle/fluid/operators/sequence_expand_op.cu | 21 ++++++++------------ paddle/fluid/operators/sequence_expand_op.h | 2 -- 2 files changed, 8 insertions(+), 15 deletions(-) diff --git a/paddle/fluid/operators/sequence_expand_op.cu b/paddle/fluid/operators/sequence_expand_op.cu index 1bd73426522bc..8a35bc908e8d7 100644 --- a/paddle/fluid/operators/sequence_expand_op.cu +++ b/paddle/fluid/operators/sequence_expand_op.cu @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #define EIGEN_USE_GPU -#include #include #include "paddle/fluid/operators/sequence_expand_op.h" #include "paddle/fluid/platform/cuda_helper.h" @@ -109,12 +108,10 @@ struct SequenceExpandFunctor { const framework::Vector& x_lod, /*expand source lod*/ const framework::Vector& ref_lod, /*expand referenced lod*/ LoDTensor* out) { - int x_item_length = 1; - x_item_length = x.numel() / x.dims()[0]; - VLOG(0) << "x_item_length" << x_item_length; - int thread_x = std::max(static_cast(ref_lod.size()), 32); - int thread_y = std::max(1024 / thread_x, 16); - int thread_z = std::min(1024 / thread_x / thread_y, 16); + int x_item_length = x.numel() / x.dims()[0]; + int thread_x = std::min(32, std::max(static_cast(ref_lod.size()), 16)); + int thread_y = 16; + int thread_z = 1024 / thread_x / thread_y; int block_x = static_cast(ref_lod.size()); dim3 block_size(thread_x, thread_y, thread_z); dim3 grid_size(block_x, 1); @@ -133,12 +130,10 @@ struct SequenceExpandGradFunctor { const framework::Vector& x_lod, /*expand source lod*/ const framework::Vector& ref_lod, /*expand based lod*/ LoDTensor* dx) { - int x_item_length = 1; - x_item_length = framework::product(dx->dims()) / dx->dims()[0]; - - int thread_x = std::max(static_cast(ref_lod.size()), 32); - int thread_y = std::max(1024 / thread_x, 16); - int thread_z = std::min(1024 / thread_x / thread_y, 16); + int x_item_length = framework::product(dx->dims()) / dx->dims()[0]; + int thread_x = std::min(32, std::max(static_cast(ref_lod.size()), 16)); + int thread_y = 16; + int thread_z = 1024 / thread_x / thread_y; int block_x = static_cast(ref_lod.size()); dim3 block_size(thread_x, thread_y, thread_z); dim3 grid_size(block_x, 1); diff --git a/paddle/fluid/operators/sequence_expand_op.h b/paddle/fluid/operators/sequence_expand_op.h index c55c3e215abdf..d62c387c3eebf 100644 --- a/paddle/fluid/operators/sequence_expand_op.h +++ b/paddle/fluid/operators/sequence_expand_op.h @@ -15,8 +15,6 @@ limitations under the License. 
From b661fe1d76514127581f2f73b177d2891677d39f Mon Sep 17 00:00:00 2001
From: dzhwinter
Date: Thu, 29 Mar 2018 01:36:34 -0700
Subject: [PATCH 08/12] "fix ci"

---
 python/paddle/fluid/tests/unittests/op_test.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py
index 555f188abb9b8..8393f7827b1c7 100644
--- a/python/paddle/fluid/tests/unittests/op_test.py
+++ b/python/paddle/fluid/tests/unittests/op_test.py
@@ -362,9 +362,6 @@ def __assert_is_close(self, numeric_grads, analytic_grads, names,
         for a, b, name in itertools.izip(numeric_grads, analytic_grads, names):
             abs_a = np.abs(a)
             abs_a[abs_a < 1e-3] = 1
-            print("actual", a)
-            print("*****")
-            print("expected", b)
             diff_mat = np.abs(a - b) / abs_a
             max_diff = np.max(diff_mat)

From fbdb5b7b437a55ce97fba37da5fdcbdd5e3e53bb Mon Sep 17 00:00:00 2001
From: dzhwinter
Date: Thu, 29 Mar 2018 19:20:50 -0700
Subject: [PATCH 09/12] "fix based on comment"

---
 paddle/fluid/operators/sequence_expand_op.cu | 68 +++++++++-----------
 1 file changed, 32 insertions(+), 36 deletions(-)

diff --git a/paddle/fluid/operators/sequence_expand_op.cu b/paddle/fluid/operators/sequence_expand_op.cu
index 8a35bc908e8d7..8119afce1a482 100644
--- a/paddle/fluid/operators/sequence_expand_op.cu
+++ b/paddle/fluid/operators/sequence_expand_op.cu
@@ -25,27 +25,17 @@ using LoDTensor = framework::LoDTensor;
 template <typename T>
 __global__ void sequence_expand_kernel(const T* x_data, const size_t* x_lod,
                                        const size_t* ref_lod,
+                                       const size_t* offset,
                                        const size_t lod_size,
                                        /* default=1,
                                           the instance length*/
                                        const int x_item_length, T* out_data) {
-  constexpr int N = 1024;
-  __shared__ int mem[N];
-  int offset = 0;
-  for (int i = 0; i < lod_size; ++i) {
-    mem[i] = offset;
-    if (i < lod_size - 1) {
-      offset += (ref_lod[i + 1] - ref_lod[i]) * (x_lod[i + 1] - x_lod[i]);
-    }
-  }
-  __syncthreads();
-
   int bid = blockIdx.x;
   if (bid >= lod_size - 1) return;
   int x_item_count = x_lod[bid + 1] - x_lod[bid];
   int repeats = ref_lod[bid + 1] - ref_lod[bid];
-  int out_offset = mem[bid];
+  int out_offset = static_cast<int>(offset[bid]);
   int x_offset = x_lod[bid];
   for (int tid_z = threadIdx.z; tid_z < repeats; tid_z += blockDim.z) {
     for (int tid_y = threadIdx.y; tid_y < x_item_count; tid_y += blockDim.y) {
@@ -59,32 +49,17 @@ __global__ void sequence_expand_kernel(const T* x_data, const size_t* x_lod,
 }
 
 template <typename T>
-__global__ void sequence_expand_grad_kernel(const T* dout_data,
-                                            const size_t* ref_lod,
-                                            const size_t* dx_lod,
-                                            const size_t lod_size,
-                                            /* default=1,
-                                               the instance length*/
-                                            const int x_item_length,
-                                            T* dx_data) {
-  // TODO(dzhwinter) : too many atomicAdd
-  // use shared memory to reduce memory visits
-  constexpr int N = 1024;
-  __shared__ int mem[N];
-  int offset = 0;
-  for (int i = 0; i < lod_size; ++i) {
-    mem[i] = offset;
-    if (i < lod_size - 1) {
-      offset += (ref_lod[i + 1] - ref_lod[i]) * (dx_lod[i + 1] - dx_lod[i]);
-    }
-  }
-  __syncthreads();
-
+__global__ void sequence_expand_grad_kernel(
+    const T* dout_data, const size_t* ref_lod, const size_t* dx_lod,
+    const size_t* offset, const size_t lod_size,
+    /* default=1, the instance length*/
+    const int x_item_length, T* dx_data) {
   int bid = blockIdx.x;
   if (bid >= lod_size - 1) return;
   int x_item_count = dx_lod[bid + 1] - dx_lod[bid];
   int repeats = ref_lod[bid + 1] - ref_lod[bid];
-  int out_offset = mem[bid];
+  int out_offset = static_cast<int>(offset[bid]);
   int x_offset = dx_lod[bid];
 
   for (int tid_z = threadIdx.z; tid_z < repeats; tid_z += blockDim.z) {
@@ -101,6 +76,19 @@ __global__ void sequence_expand_grad_kernel(const T* dout_data,
   }
 }
 
+void GetOutputOffset(const framework::Vector<size_t>& x_lod,
+                     const framework::Vector<size_t>& ref_lod,
+                     framework::Vector<size_t>& out_offset) {
+  size_t offset = 0;
+  int lod_size = static_cast<int>(x_lod.size());
+  for (int i = 0; i < static_cast<int>(x_lod.size()); ++i) {
+    out_offset[i] = offset;
+    if (i < lod_size - 1) {
+      offset += (ref_lod[i + 1] - ref_lod[i]) * (x_lod[i + 1] - x_lod[i]);
+    }
+  }
+}
+
 template <typename T>
 struct SequenceExpandFunctor<platform::CUDADeviceContext, T> {
   void operator()(
@@ -109,6 +97,9 @@ struct SequenceExpandFunctor<platform::CUDADeviceContext, T> {
       const framework::Vector<size_t>& ref_lod, /*expand referenced lod*/
       LoDTensor* out) {
     int x_item_length = x.numel() / x.dims()[0];
+    framework::Vector<size_t> out_offset(x_lod.size());
+    GetOutputOffset(x_lod, ref_lod, out_offset);
+
     int thread_x = std::min(32, std::max(static_cast<int>(ref_lod.size()), 16));
     int thread_y = 16;
     int thread_z = 1024 / thread_x / thread_y;
@@ -118,7 +109,8 @@ struct SequenceExpandFunctor<platform::CUDADeviceContext, T> {
     sequence_expand_kernel<<<grid_size, block_size, 0, context.stream()>>>(
         x.data<T>(), x_lod.CUDAData(context.GetPlace()),
-        ref_lod.CUDAData(context.GetPlace()), x_lod.size(), x_item_length,
+        ref_lod.CUDAData(context.GetPlace()),
+        out_offset.CUDAData(context.GetPlace()), x_lod.size(), x_item_length,
         out->mutable_data<T>(context.GetPlace()));
   }
 };
@@ -131,6 +123,9 @@ struct SequenceExpandGradFunctor<platform::CUDADeviceContext, T> {
      const framework::Vector<size_t>& ref_lod, /*expand based lod*/
      LoDTensor* dx) {
    int x_item_length = framework::product(dx->dims()) / dx->dims()[0];
+    framework::Vector<size_t> out_offset(x_lod.size());
+    GetOutputOffset(x_lod, ref_lod, out_offset);
+
    int thread_x = std::min(32, std::max(static_cast<int>(ref_lod.size()), 16));
    int thread_y = 16;
    int thread_z = 1024 / thread_x / thread_y;
@@ -139,7 +134,8 @@ struct SequenceExpandGradFunctor<platform::CUDADeviceContext, T> {
     dim3 grid_size(block_x, 1);
     sequence_expand_grad_kernel<<<grid_size, block_size, 0, context.stream()>>>(
         dout.data<T>(), ref_lod.CUDAData(context.GetPlace()),
-        x_lod.CUDAData(context.GetPlace()), ref_lod.size(), x_item_length,
+        x_lod.CUDAData(context.GetPlace()),
+        out_offset.CUDAData(context.GetPlace()), ref_lod.size(), x_item_length,
         dx->mutable_data<T>(context.GetPlace()));
   }
 };
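
Note: GetOutputOffset above is an exclusive prefix sum over segments: offset[i] is the output row at which the expansion of X segment i starts, i.e. the sum of repeats * segment_rows over all earlier segments. An illustrative NumPy equivalent (a sketch, not part of the patch), fed with the LoDs from TestSequenceExpandCase4:

import numpy as np

def get_output_offset(x_lod, ref_lod):
    # offset[i] = sum over j < i of (ref_lod[j+1] - ref_lod[j]) * (x_lod[j+1] - x_lod[j])
    sizes = np.diff(np.asarray(x_lod)) * np.diff(np.asarray(ref_lod))
    return np.concatenate(([0], np.cumsum(sizes)))

# X segments [0,2) and [2,5) repeated 1 and 2 times respectively.
print(get_output_offset([0, 2, 5], [0, 1, 3]))  # [0 2 8]: starts at rows 0 and 2, 8 rows total
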
From c72450d24d49c547d8e6bfc75691f429c19d6a79 Mon Sep 17 00:00:00 2001
From: dzhwinter
Date: Mon, 2 Apr 2018 23:46:34 -0700
Subject: [PATCH 10/12] "seperate test"

---
 python/paddle/fluid/tests/unittests/CMakeLists.txt | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index 0ad273c716197..3873fda22665f 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -28,6 +28,8 @@ function(py_test_modules TARGET_NAME)
   endif()
 endfunction()
 
+list(REMOVE_ITEM TEST_OPS test_sequence_expand)
+
 # test time consuming OPs in a separate process for expliot parallism
 list(REMOVE_ITEM TEST_OPS test_warpctc_op)
 list(REMOVE_ITEM TEST_OPS test_dyn_rnn)
@@ -63,6 +65,8 @@ else()
     endforeach(TEST_OP)
 endif(WITH_FAST_BUNDLE_TEST)
 
+#
+py_test_modules(test_sequence_expand MODULES test_sequence_expand)
 # tests with high overhead
 py_test_modules(test_warpctc_op MODULES test_warpctc_op ENVS FLAGS_warpctc_dir=${WARPCTC_LIB_DIR})
 py_test_modules(test_train_dyn_rnn MODULES test_dyn_rnn)

From 80bd1ca01f62871b7e14fbdbe70482b3eeff9779 Mon Sep 17 00:00:00 2001
From: dzhwinter
Date: Wed, 11 Apr 2018 01:31:59 -0700
Subject: [PATCH 11/12] "fix the style"

---
 paddle/fluid/operators/sequence_expand_op.cu | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/paddle/fluid/operators/sequence_expand_op.cu b/paddle/fluid/operators/sequence_expand_op.cu
index 8119afce1a482..111ccba2255fe 100644
--- a/paddle/fluid/operators/sequence_expand_op.cu
+++ b/paddle/fluid/operators/sequence_expand_op.cu
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#define EIGEN_USE_GPU
 #include <algorithm>
 #include "paddle/fluid/operators/sequence_expand_op.h"
 #include "paddle/fluid/platform/cuda_helper.h"
@@ -78,7 +77,7 @@ __global__ void sequence_expand_grad_kernel(
 void GetOutputOffset(const framework::Vector<size_t>& x_lod,
                      const framework::Vector<size_t>& ref_lod,
-                     framework::Vector<size_t>& out_offset) {
+                     framework::Vector<size_t>* out_offset) {
   size_t offset = 0;
   int lod_size = static_cast<int>(x_lod.size());
   for (int i = 0; i < static_cast<int>(x_lod.size()); ++i) {
@@ -98,7 +97,7 @@ struct SequenceExpandFunctor<platform::CUDADeviceContext, T> {
       LoDTensor* out) {
     int x_item_length = x.numel() / x.dims()[0];
     framework::Vector<size_t> out_offset(x_lod.size());
-    GetOutputOffset(x_lod, ref_lod, out_offset);
+    GetOutputOffset(x_lod, ref_lod, &out_offset);
 
     int thread_x = std::min(32, std::max(static_cast<int>(ref_lod.size()), 16));
     int thread_y = 16;
@@ -124,7 +123,7 @@ struct SequenceExpandGradFunctor<platform::CUDADeviceContext, T> {
       LoDTensor* dx) {
     int x_item_length = framework::product(dx->dims()) / dx->dims()[0];
     framework::Vector<size_t> out_offset(x_lod.size());
-    GetOutputOffset(x_lod, ref_lod, out_offset);
+    GetOutputOffset(x_lod, ref_lod, &out_offset);
 
     int thread_x = std::min(32, std::max(static_cast<int>(ref_lod.size()), 16));
     int thread_y = 16;

From 62d1f9a7cb9b850584fcd22d1c2b57f31174a13a Mon Sep 17 00:00:00 2001
From: dzhwinter
Date: Wed, 11 Apr 2018 01:44:57 -0700
Subject: [PATCH 12/12] "done"

---
 paddle/fluid/operators/sequence_expand_op.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/fluid/operators/sequence_expand_op.cu b/paddle/fluid/operators/sequence_expand_op.cu
index 111ccba2255fe..c00765e5d59af 100644
--- a/paddle/fluid/operators/sequence_expand_op.cu
+++ b/paddle/fluid/operators/sequence_expand_op.cu
@@ -81,7 +81,7 @@ void GetOutputOffset(const framework::Vector<size_t>& x_lod,
   size_t offset = 0;
   int lod_size = static_cast<int>(x_lod.size());
   for (int i = 0; i < static_cast<int>(x_lod.size()); ++i) {
-    out_offset[i] = offset;
+    (*out_offset)[i] = offset;
     if (i < lod_size - 1) {
       offset += (ref_lod[i + 1] - ref_lod[i]) * (x_lod[i + 1] - x_lod[i]);
     }
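
Note: for reference, a serial NumPy sketch (illustrative only, not part of the patch series) of the index mapping the final kernel uses: blockIdx.x owns one source segment, threadIdx.z walks the repeats, threadIdx.y the rows of the segment, and threadIdx.x the features of a row.

import numpy as np

def expand_serial(x, x_lod, ref_lod):
    x_lod, ref_lod = np.asarray(x_lod), np.asarray(ref_lod)
    seg_rows, repeats = np.diff(x_lod), np.diff(ref_lod)
    # Same exclusive prefix sum that GetOutputOffset precomputes on the host.
    offset = np.concatenate(([0], np.cumsum(seg_rows * repeats)))
    out = np.empty((offset[-1], x.shape[1]), dtype=x.dtype)
    for bid in range(len(x_lod) - 1):            # blockIdx.x: one source segment
        for tid_z in range(repeats[bid]):        # threadIdx.z: repeats
            for tid_y in range(seg_rows[bid]):   # threadIdx.y: rows of the segment
                out_row = offset[bid] + tid_z * seg_rows[bid] + tid_y
                out[out_row] = x[x_lod[bid] + tid_y]  # threadIdx.x: features
    return out

x = np.arange(10, dtype='float32').reshape(5, 2)
print(expand_serial(x, [0, 2, 5], [0, 1, 3]).shape)  # (8, 2)
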