From 4ee1c9e60d278a5172c18549bfebbbe533fdfade Mon Sep 17 00:00:00 2001 From: dzhwinter Date: Mon, 19 Mar 2018 19:07:57 -0700 Subject: [PATCH 01/12] "add sequence expand kernel" --- paddle/fluid/operators/sequence_expand_op.cu | 52 +++++++++++++++++++ paddle/fluid/operators/sequence_expand_op.h | 53 +++++++++++++------- 2 files changed, 86 insertions(+), 19 deletions(-) diff --git a/paddle/fluid/operators/sequence_expand_op.cu b/paddle/fluid/operators/sequence_expand_op.cu index 26622d23afa1c..6477af89f110a 100644 --- a/paddle/fluid/operators/sequence_expand_op.cu +++ b/paddle/fluid/operators/sequence_expand_op.cu @@ -15,6 +15,58 @@ limitations under the License. */ #define EIGEN_USE_GPU #include "paddle/fluid/operators/sequence_expand_op.h" +namespace paddle { +namespace operators { + +using LoDTensor = framework::LoDTensor; + +template +__global__ sequence_expand_kernel(const T* x_data, T* out_data, size_t* lod, + size_t element_len) { + int BLOCK_SIZE = 1024; + __shared__ T shm_lod[BLOCK_SIZE]; + for (int idx = threadIdx.x; idx < BLOCK_SIZE; ++idx) { + shm_lod[idx] = lod[idx]; + } + for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < lod.size(); + idx += blockDim.x * gridDim.x) { + int scale = lod[i] + } +} + +template +void SequenceExpandFunctor::operator()( + const platform::CPUDeviceContext& context, const LoDTensor& x, + LoDTensor* out) { + x_dims = x.dims(); + size_t element_len = framework::product(x_dims) / x_dims[0]; + T* out_data = out->mutable_data(context.GetPlace()); + auto out_starts = out->lod().back(); + + const int kThreadsPerBlock = 1024; + int block_cols = kThreadsPerBlock; + if (out_cols < kThreadsPerBlock) { // block_cols is aligned by 32. + block_cols = ((out_cols + 31) >> 5) << 5; + } + int block_rows = kThreadsPerBlock / block_cols; + dim3 block_size = dim3(block_cols, block_rows, 1); + + int max_threads = context.GetMaxPhysicalThreadCount(); + int max_blocks = std::max(max_threads / kThreadsPerBlock, 1); + + int grid_cols = + std::min((out_cols + block_cols - 1) / block_cols, max_blocks); + int grid_rows = + std::min(max_blocks / grid_cols, std::max(out_rows / block_rows, 1)); + dim3 grid_size = dim3(grid_cols, grid_rows, 1); + sequence_expand_kernel<<>>( + x.data(), out->mutable_data(context.GetPlace()), + out_starts.CUDAData(context.GetPlace()), element_len); +} + +} // namespace operators +} // namespace paddle + namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( sequence_expand, diff --git a/paddle/fluid/operators/sequence_expand_op.h b/paddle/fluid/operators/sequence_expand_op.h index 76dde976db2d1..12e4018b95bbf 100644 --- a/paddle/fluid/operators/sequence_expand_op.h +++ b/paddle/fluid/operators/sequence_expand_op.h @@ -16,13 +16,44 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/memory/memcpy.h" -#include "unsupported/Eigen/CXX11/Tensor" +#include "paddle/fluid/platform/device_context.h" namespace paddle { namespace operators { using LoDTensor = framework::LoDTensor; +template +struct SequenceExpandFunctor { + void operator()(const DeviceContext& ctx, const LoDTensor& x, LoDTensor* out); +}; + +// template +// struct SequenceExpandGradFunctor {}; + +template +void SequenceExpandFunctor::operator()( + const platform::CPUDeviceContext& context, const LoDTensor& x, + LoDTensor* out) { + x_dims = x.dims(); + size_t element_len = framework::product(x_dims) / x_dims[0]; + T* out_data = out->mutable_data(context.GetPlace()); + auto out_starts = out->lod().back(); + + for (size_t i = 0; i < out_starts.size() - 1; i++) { + int scale = out_starts[i + 1] - out_starts[i]; + Eigen::TensorMap< + Eigen::Tensor> + x_t(x_data, 1, element_len); + Eigen::TensorMap> + out_t(out_data, scale, element_len); + Eigen::array cast({{scale, 1}}); + out_t.device(*context.eigen_device()) = x_t.broadcast(cast); + x_data += element_len; + out_data += element_len * scale; + } +} + template class SequenceExpandKernel : public framework::OpKernel { public: @@ -38,24 +69,8 @@ class SequenceExpandKernel : public framework::OpKernel { "The size of last lod level in Input(Y)" "must be equal to dims[0] of Input(X)."); out->set_lod(y->lod()); - auto* place = - context.template device_context().eigen_device(); - size_t element_len = framework::product(x_dims) / x_dims[0]; - T* out_data = out->mutable_data(context.GetPlace()); - auto out_starts = out->lod().back(); - - for (size_t i = 0; i < out_starts.size() - 1; i++) { - int scale = out_starts[i + 1] - out_starts[i]; - Eigen::TensorMap< - Eigen::Tensor> - x_t(x_data, 1, element_len); - Eigen::TensorMap> - out_t(out_data, scale, element_len); - Eigen::array cast({{scale, 1}}); - out_t.device(*place) = x_t.broadcast(cast); - x_data += element_len; - out_data += element_len * scale; - } + SequenceExpandFunctor functor; + functor(context.template device_context(), *x, out); } }; From 26822bd774a99d19d5bb37f4890e82aacd57c391 Mon Sep 17 00:00:00 2001 From: dzhwinter Date: Tue, 20 Mar 2018 04:04:58 -0700 Subject: [PATCH 02/12] "add sequence kernel" --- paddle/fluid/operators/sequence_expand_op.cu | 107 +++++++++++++------ paddle/fluid/operators/sequence_expand_op.h | 86 ++++++++------- 2 files changed, 123 insertions(+), 70 deletions(-) diff --git a/paddle/fluid/operators/sequence_expand_op.cu b/paddle/fluid/operators/sequence_expand_op.cu index 6477af89f110a..9cdb89f8fd8c8 100644 --- a/paddle/fluid/operators/sequence_expand_op.cu +++ b/paddle/fluid/operators/sequence_expand_op.cu @@ -21,48 +21,89 @@ namespace operators { using LoDTensor = framework::LoDTensor; template -__global__ sequence_expand_kernel(const T* x_data, T* out_data, size_t* lod, - size_t element_len) { - int BLOCK_SIZE = 1024; - __shared__ T shm_lod[BLOCK_SIZE]; - for (int idx = threadIdx.x; idx < BLOCK_SIZE; ++idx) { - shm_lod[idx] = lod[idx]; +__global__ void sequence_expand_kernel(const T* x_data, T* out_data, + const size_t* lod, size_t lod_size, + size_t element_len) { + int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + for (; tid_x < static_cast(lod_size - 1); + tid_x += blockDim.x * gridDim.x) { + int scale = lod[tid_x + 1] - lod[tid_x]; + int tid_y = blockIdx.y * blockDim.y + threadIdx.y; + for (; tid_y < scale; tid_y += blockDim.y * gridDim.y) { + int tid_z = blockIdx.z * blockDim.z + threadIdx.z; + int 
item_start = tid_x / element_len; + for (; tid_z < element_len; tid_z += blockDim.z * gridDim.z) { + out_data[item_start * scale + tid_z] = x_data[item_start + tid_z]; + } + } } - for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < lod.size(); - idx += blockDim.x * gridDim.x) { - int scale = lod[i] +} + +template +__global__ void sequence_expand_grad_kernel(const T* dout_data, T* dx_data, + const size_t* lod, size_t lod_size, + size_t element_len, + size_t dout_size) { + extern __shared__ T shm[]; + int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + for (; tid_x < static_cast(lod_size - 1); + tid_x += blockDim.x * gridDim.x) { + int scale = lod[tid_x + 1] - lod[tid_x]; + int tid_y = blockIdx.y * blockDim.y + threadIdx.y; + for (; tid_y < scale; tid_y += blockDim.y * gridDim.y) { + int tid_z = blockIdx.z * blockDim.z + threadIdx.z; + int item_start = tid_x / element_len; + for (; tid_z < element_len; tid_z += blockDim.z * gridDim.z) { + shm[item_start + tid_z] += doutx_data[item_start * scale + tid_z]; + } + } + } + // synchronize before write to dx + __syncthreads(); + for (int idx = blockDimx * blockIdx.x + threadIdx.x; + idx < static_cast(dout_size); idx += blockDim.x * gridDim.x) { + dx_data[idx] = shm[idx;] } } template -void SequenceExpandFunctor::operator()( - const platform::CPUDeviceContext& context, const LoDTensor& x, - LoDTensor* out) { - x_dims = x.dims(); - size_t element_len = framework::product(x_dims) / x_dims[0]; - T* out_data = out->mutable_data(context.GetPlace()); - auto out_starts = out->lod().back(); +struct SequenceExpandFunctor { + void operator()(const platform::CUDADeviceContext& context, + const LoDTensor& x, LoDTensor* out) { + auto x_dims = x.dims(); + size_t element_len = framework::product(x_dims) / x_dims[0]; + T* out_data = out->mutable_data(context.GetPlace()); + auto out_starts = out->lod().back(); - const int kThreadsPerBlock = 1024; - int block_cols = kThreadsPerBlock; - if (out_cols < kThreadsPerBlock) { // block_cols is aligned by 32. 
- block_cols = ((out_cols + 31) >> 5) << 5; + dim3 block_size(16, 32, element_len); + dim3 grid_size(10, 10); + sequence_expand_kernel<<>>( + x.data(), out->mutable_data(context.GetPlace()), + out_starts.CUDAData(context.GetPlace()), out_starts.size(), + element_len); } - int block_rows = kThreadsPerBlock / block_cols; - dim3 block_size = dim3(block_cols, block_rows, 1); +}; - int max_threads = context.GetMaxPhysicalThreadCount(); - int max_blocks = std::max(max_threads / kThreadsPerBlock, 1); +template +struct SequenceExpandGradFunctor { + void operator()(const platform::CUDADeviceContext& ctx, const LoDTensor& x, + const LoDTensor& out, const LoDTensor& dout, LoDTensor* dx) { + auto x_dims = x.dims(); + size_t element_len = framework::product(x_dims) / x_dims[0]; + const T* x_data = x->data(); + T* out_data = out->mutable_data(context.GetPlace()); + auto out_starts = out->lod().back(); - int grid_cols = - std::min((out_cols + block_cols - 1) / block_cols, max_blocks); - int grid_rows = - std::min(max_blocks / grid_cols, std::max(out_rows / block_rows, 1)); - dim3 grid_size = dim3(grid_cols, grid_rows, 1); - sequence_expand_kernel<<>>( - x.data(), out->mutable_data(context.GetPlace()), - out_starts.CUDAData(context.GetPlace()), element_len); -} + dim3 block_size(16, 32, element_len); + dim3 grid_size(10, 10); + size_t out_size = framework::product(dx->dims()); + sequence_expand_kernel<<>>( + dout.data(), dx->mutable_data(context.GetPlace()), + out_starts.CUDAData(context.GetPlace()), out_starts.size(), element_len, + out_size); + } +}; } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/sequence_expand_op.h b/paddle/fluid/operators/sequence_expand_op.h index 12e4018b95bbf..3b66bf3d8cbdc 100644 --- a/paddle/fluid/operators/sequence_expand_op.h +++ b/paddle/fluid/operators/sequence_expand_op.h @@ -28,31 +28,36 @@ struct SequenceExpandFunctor { void operator()(const DeviceContext& ctx, const LoDTensor& x, LoDTensor* out); }; -// template -// struct SequenceExpandGradFunctor {}; +template +struct SequenceExpandGradFunctor { + void operator()(const DeviceContext& ctx, const LoDTensor& x, + const LoDTensor& out, const LoDTensor& dout, LoDTensor* dx); +}; template -void SequenceExpandFunctor::operator()( - const platform::CPUDeviceContext& context, const LoDTensor& x, - LoDTensor* out) { - x_dims = x.dims(); - size_t element_len = framework::product(x_dims) / x_dims[0]; - T* out_data = out->mutable_data(context.GetPlace()); - auto out_starts = out->lod().back(); +struct SequenceExpandFunctor { + void operator()(const platform::CPUDeviceContext& context, const LoDTensor& x, + LoDTensor* out) { + auto x_dims = x.dims(); + size_t element_len = framework::product(x_dims) / x_dims[0]; + const T* x_data = x->data(); + T* out_data = out->mutable_data(context.GetPlace()); + auto out_starts = out->lod().back(); - for (size_t i = 0; i < out_starts.size() - 1; i++) { - int scale = out_starts[i + 1] - out_starts[i]; - Eigen::TensorMap< - Eigen::Tensor> - x_t(x_data, 1, element_len); - Eigen::TensorMap> - out_t(out_data, scale, element_len); - Eigen::array cast({{scale, 1}}); - out_t.device(*context.eigen_device()) = x_t.broadcast(cast); - x_data += element_len; - out_data += element_len * scale; + for (size_t i = 0; i < out_starts.size() - 1; i++) { + int scale = out_starts[i + 1] - out_starts[i]; + Eigen::TensorMap< + Eigen::Tensor> + x_t(x_data, 1, element_len); + Eigen::TensorMap> + out_t(out_data, scale, element_len); + Eigen::array cast({{scale, 1}}); + 
out_t.device(*context.eigen_device()) = x_t.broadcast(cast); + x_data += element_len; + out_data += element_len * scale; + } } -} +}; template class SequenceExpandKernel : public framework::OpKernel { @@ -60,7 +65,6 @@ class SequenceExpandKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& context) const override { auto* x = context.Input("X"); auto* out = context.Output("Out"); - const T* x_data = x->data(); auto x_dims = x->dims(); auto* y = context.Input("Y"); PADDLE_ENFORCE(!y->lod().empty(), "y should have lod"); @@ -86,19 +90,14 @@ class SequenceExpandKernel : public framework::OpKernel { * Grad(X).lod = Input(X).lod * * */ -template -class SequenceExpandGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* d_out = context.Input(framework::GradVarName("Out")); - auto* x = context.Input("X"); - auto* out = context.Input("Out"); - auto* d_x = context.Output(framework::GradVarName("X")); - auto out_last_level = out->lod().back(); - d_x->set_lod(x->lod()); - const T* d_out_data = d_out->data(); +template +struct SequenceExpandGradFunctor { + void operator()(const platform::CPUDeviceContext& ctx, const LoDTensor& x, + const LoDTensor& out, const LoDTensor& dout, LoDTensor* dx) { + auto out_last_level = out.lod().back(); + const T* d_out_data = d_out.data(); T* d_x_data = d_x->mutable_data(context.GetPlace()); - size_t element_len = d_out->numel() / d_out->dims()[0]; + size_t element_len = d_out.numel() / d_out.dims()[0]; for (size_t i = 0; i < out_last_level.size() - 1; ++i) { size_t repeat = out_last_level[i + 1] - out_last_level[i]; Eigen::TensorMap< @@ -106,14 +105,27 @@ class SequenceExpandGradKernel : public framework::OpKernel { d_out_t(d_out_data, static_cast(repeat), element_len); Eigen::TensorMap> d_x_t(d_x_data, static_cast(element_len)); - auto place = - context.template device_context().eigen_device(); - d_x_t.device(*place) = d_out_t.sum(Eigen::array({{0}})); + d_x_t.device(*context.eigen_device()) = + d_out_t.sum(Eigen::array({{0}})); d_out_data += (repeat * element_len); d_x_data += element_len; } } }; +template +class SequenceExpandGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* d_out = context.Input(framework::GradVarName("Out")); + auto* x = context.Input("X"); + auto* out = context.Input("Out"); + auto* d_x = context.Output(framework::GradVarName("X")); + d_x->set_lod(x->lod()); + SequenceExpandGradFunctor(context.template device_context(), *x, *out, + d_out, d_x); + } +}; + } // namespace operators } // namespace paddle From e4c35d837d79c4b1a4f30e42efe143f64ec10e71 Mon Sep 17 00:00:00 2001 From: dzhwinter Date: Tue, 20 Mar 2018 04:43:00 -0700 Subject: [PATCH 03/12] "add details" --- paddle/fluid/operators/sequence_expand_op.cu | 19 +++++++++---------- paddle/fluid/operators/sequence_expand_op.h | 18 ++++++++++-------- 2 files changed, 19 insertions(+), 18 deletions(-) diff --git a/paddle/fluid/operators/sequence_expand_op.cu b/paddle/fluid/operators/sequence_expand_op.cu index 9cdb89f8fd8c8..cae0a6928455b 100644 --- a/paddle/fluid/operators/sequence_expand_op.cu +++ b/paddle/fluid/operators/sequence_expand_op.cu @@ -54,15 +54,15 @@ __global__ void sequence_expand_grad_kernel(const T* dout_data, T* dx_data, int tid_z = blockIdx.z * blockDim.z + threadIdx.z; int item_start = tid_x / element_len; for (; tid_z < element_len; tid_z += blockDim.z * gridDim.z) { - 
shm[item_start + tid_z] += doutx_data[item_start * scale + tid_z]; + shm[item_start + tid_z] += dout_data[item_start * scale + tid_z]; } } } // synchronize before write to dx __syncthreads(); - for (int idx = blockDimx * blockIdx.x + threadIdx.x; + for (int idx = blockDim.x * blockIdx.x + threadIdx.x; idx < static_cast(dout_size); idx += blockDim.x * gridDim.x) { - dx_data[idx] = shm[idx;] + dx_data[idx] = shm[idx]; } } @@ -86,19 +86,18 @@ struct SequenceExpandFunctor { template struct SequenceExpandGradFunctor { - void operator()(const platform::CUDADeviceContext& ctx, const LoDTensor& x, - const LoDTensor& out, const LoDTensor& dout, LoDTensor* dx) { + void operator()(const platform::CUDADeviceContext& context, + const LoDTensor& x, const LoDTensor& out, + const LoDTensor& dout, LoDTensor* dx) { auto x_dims = x.dims(); size_t element_len = framework::product(x_dims) / x_dims[0]; - const T* x_data = x->data(); - T* out_data = out->mutable_data(context.GetPlace()); - auto out_starts = out->lod().back(); + auto out_starts = out.lod().back(); dim3 block_size(16, 32, element_len); dim3 grid_size(10, 10); size_t out_size = framework::product(dx->dims()); - sequence_expand_kernel<<>>( + sequence_expand_grad_kernel<<>>( dout.data(), dx->mutable_data(context.GetPlace()), out_starts.CUDAData(context.GetPlace()), out_starts.size(), element_len, out_size); diff --git a/paddle/fluid/operators/sequence_expand_op.h b/paddle/fluid/operators/sequence_expand_op.h index 3b66bf3d8cbdc..11890b30ae598 100644 --- a/paddle/fluid/operators/sequence_expand_op.h +++ b/paddle/fluid/operators/sequence_expand_op.h @@ -40,7 +40,7 @@ struct SequenceExpandFunctor { LoDTensor* out) { auto x_dims = x.dims(); size_t element_len = framework::product(x_dims) / x_dims[0]; - const T* x_data = x->data(); + const T* x_data = x.data(); T* out_data = out->mutable_data(context.GetPlace()); auto out_starts = out->lod().back(); @@ -92,12 +92,12 @@ class SequenceExpandKernel : public framework::OpKernel { * */ template struct SequenceExpandGradFunctor { - void operator()(const platform::CPUDeviceContext& ctx, const LoDTensor& x, + void operator()(const platform::CPUDeviceContext& context, const LoDTensor& x, const LoDTensor& out, const LoDTensor& dout, LoDTensor* dx) { auto out_last_level = out.lod().back(); - const T* d_out_data = d_out.data(); - T* d_x_data = d_x->mutable_data(context.GetPlace()); - size_t element_len = d_out.numel() / d_out.dims()[0]; + const T* d_out_data = dout.data(); + T* d_x_data = dx->mutable_data(context.GetPlace()); + size_t element_len = dout.numel() / dout.dims()[0]; for (size_t i = 0; i < out_last_level.size() - 1; ++i) { size_t repeat = out_last_level[i + 1] - out_last_level[i]; Eigen::TensorMap< @@ -117,13 +117,15 @@ template class SequenceExpandGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* d_out = context.Input(framework::GradVarName("Out")); auto* x = context.Input("X"); auto* out = context.Input("Out"); + auto* d_out = context.Input(framework::GradVarName("Out")); + auto* d_x = context.Output(framework::GradVarName("X")); d_x->set_lod(x->lod()); - SequenceExpandGradFunctor(context.template device_context(), *x, *out, - d_out, d_x); + SequenceExpandGradFunctor functor; + functor(context.template device_context(), *x, *out, *d_out, + d_x); } }; From 53c8c36a04f92685f3fc380cbc41b9af1031de67 Mon Sep 17 00:00:00 2001 From: dzhwinter Date: Wed, 21 Mar 2018 05:49:53 -0700 Subject: [PATCH 04/12] "debug the process" --- 
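A rough numpy sketch of the expansion semantics these kernels are working
toward, close to the reference computation the unit test in
python/paddle/fluid/tests/unittests/test_sequence_expand.py converges on later
in this series (the helper name and the standalone function are illustrative
only, not part of the patch):

import numpy as np

def sequence_expand_ref(x_data, x_lod, y_lod):
    # Sequence offsets of X; a LoD-less X treats every row as its own sequence.
    x_idx = x_lod[0] if x_lod is not None else list(range(x_data.shape[0] + 1))
    # The last LoD level of Y decides how often each X sequence is repeated.
    ref = y_lod[-1]
    out = np.zeros((0, x_data.shape[1]), dtype=x_data.dtype)
    for i in range(1, len(ref)):
        repeat_num = ref[i] - ref[i - 1]
        if repeat_num > 0:
            x_sub = x_data[x_idx[i - 1]:x_idx[i], :]
            # The whole sub-sequence is repeated as a block, repeat_num times.
            out = np.vstack((out, np.tile(x_sub, (repeat_num, 1))))
    return out

For example, with x of shape [3, 1] and y_lod = [[0, 1, 4, 8]] (the first unit
test), rows 0, 1 and 2 of x are repeated 1, 3 and 4 times, giving an 8-row
output.
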
paddle/fluid/framework/executor.cc | 2 +- paddle/fluid/operators/sequence_expand_op.cu | 128 ++++++++++++------ .../paddle/fluid/tests/unittests/op_test.py | 3 + .../tests/unittests/test_sequence_expand.py | 88 ++++++------ 4 files changed, 133 insertions(+), 88 deletions(-) diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 7155d5ef2febc..5125072ddd3d5 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -44,7 +44,7 @@ struct ExecutorPrepareContext { ExecutorPrepareContext(const framework::ProgramDesc& prog, size_t block_id) : prog_(prog), block_id_(block_id) {} - const framework::ProgramDesc& prog_; + const framework::ProgramDesc prog_; size_t block_id_; std::vector> ops_; }; diff --git a/paddle/fluid/operators/sequence_expand_op.cu b/paddle/fluid/operators/sequence_expand_op.cu index cae0a6928455b..bf453ca7e8ea3 100644 --- a/paddle/fluid/operators/sequence_expand_op.cu +++ b/paddle/fluid/operators/sequence_expand_op.cu @@ -13,7 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #define EIGEN_USE_GPU +#include +#include #include "paddle/fluid/operators/sequence_expand_op.h" +#include "paddle/fluid/platform/cuda_helper.h" namespace paddle { namespace operators { @@ -22,47 +25,71 @@ using LoDTensor = framework::LoDTensor; template __global__ void sequence_expand_kernel(const T* x_data, T* out_data, - const size_t* lod, size_t lod_size, - size_t element_len) { - int tid_x = blockIdx.x * blockDim.x + threadIdx.x; - for (; tid_x < static_cast(lod_size - 1); - tid_x += blockDim.x * gridDim.x) { - int scale = lod[tid_x + 1] - lod[tid_x]; - int tid_y = blockIdx.y * blockDim.y + threadIdx.y; - for (; tid_y < scale; tid_y += blockDim.y * gridDim.y) { - int tid_z = blockIdx.z * blockDim.z + threadIdx.z; - int item_start = tid_x / element_len; - for (; tid_z < element_len; tid_z += blockDim.z * gridDim.z) { - out_data[item_start * scale + tid_z] = x_data[item_start + tid_z]; - } + const size_t* lod, + const size_t* out_offset, + size_t lod_size, size_t element_len, + size_t x_size) { + int bid_x = blockIdx.x; + if (bid_x > lod_size) return; + int repeats = lod[bid_x]; + int offset = out_offset[bid_x]; + for (int tid_y = threadIdx.y; tid_y < repeats; tid_y += blockDim.y) { + for (int tid_x = threadIdx.x; tid_x < element_len; tid_x += blockDim.x) { + out_data[(offset + tid_y) * element_len + tid_x] = + x_data[bid_x * element_len + tid_x]; } } } template __global__ void sequence_expand_grad_kernel(const T* dout_data, T* dx_data, - const size_t* lod, size_t lod_size, - size_t element_len, - size_t dout_size) { + const size_t* lod, + const size_t* out_offset, + size_t lod_size, size_t element_len, + size_t dout_size, size_t dx_size) { + // reduce visit memory time. 
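+  // Sketch of the approach below: dout is first staged into shared memory,
+  // the per-row gradients of dx are accumulated there with CudaAtomicAdd,
+  // and only after __syncthreads() is the result copied back to dx_data,
+  // instead of accumulating directly in global memory.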
+ // dout_shm = [0 - dout_size-1], dx_shm = [dout_size-1, dout_size + dx_size-1] + if (blockIdx.x == 0 && blockIdx.y == 0 && threadIdx.x == 0 && + threadIdx.y == 0) { + printf("lod_size=%ld, element_size=%ld, dout_size=%ld, dx_size=%ld\n", + lod_size, element_len, dout_size, dx_size); + } extern __shared__ T shm[]; - int tid_x = blockIdx.x * blockDim.x + threadIdx.x; - for (; tid_x < static_cast(lod_size - 1); - tid_x += blockDim.x * gridDim.x) { - int scale = lod[tid_x + 1] - lod[tid_x]; - int tid_y = blockIdx.y * blockDim.y + threadIdx.y; - for (; tid_y < scale; tid_y += blockDim.y * gridDim.y) { - int tid_z = blockIdx.z * blockDim.z + threadIdx.z; - int item_start = tid_x / element_len; - for (; tid_z < element_len; tid_z += blockDim.z * gridDim.z) { - shm[item_start + tid_z] += dout_data[item_start * scale + tid_z]; - } + T* dout_shm = shm; + T* dx_shm = &shm[dout_size]; + + // int idx = threadIdx.x + blockIdx.x * blockDim.x; + for (int idx = 0; idx < dout_size; ++idx) { + if (idx < dx_size) { + dx_shm[idx] = 0.0; + } + if (idx < dout_size) { + dout_shm[idx] = dout_data[idx]; + } + } + + int bid_x = blockIdx.x; + if (bid_x > lod_size) return; + int repeats = lod[bid_x]; + int offset = out_offset[bid_x]; + if (threadIdx.x == 0) { + printf("repeats=%d, offset=%ld\n", repeats, offset); + } + for (int tid_y = threadIdx.y; tid_y < repeats; tid_y += blockDim.y) { + for (int tid_x = threadIdx.x; tid_x < element_len; tid_x += blockDim.x) { + T val = dout_shm[(offset + tid_y) * element_len + tid_x]; + platform::CudaAtomicAdd(&dx_shm[bid_x * element_len + tid_x], val); + int dx_idx = bid_x * element_len + tid_x; + int dout_idx = (offset + tid_y) * element_len + tid_x; + printf("dx_idx=%d, dout_idx=%d, dx_data=%f, dout_data=%f, val=%f \n", + dx_idx, dout_idx, dx_shm[dx_idx], dout_shm[dout_idx], val); } } - // synchronize before write to dx __syncthreads(); - for (int idx = blockDim.x * blockIdx.x + threadIdx.x; - idx < static_cast(dout_size); idx += blockDim.x * gridDim.x) { - dx_data[idx] = shm[idx]; + // copy shared memory back to dx + for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < dx_size; + idx += blockDim.x * gridDim.x) { + dx_data[idx] = dx_shm[idx]; } } @@ -72,15 +99,20 @@ struct SequenceExpandFunctor { const LoDTensor& x, LoDTensor* out) { auto x_dims = x.dims(); size_t element_len = framework::product(x_dims) / x_dims[0]; - T* out_data = out->mutable_data(context.GetPlace()); - auto out_starts = out->lod().back(); + auto lod = out->lod().back(); + framework::Vector out_lod; + for (size_t i = 0; i < lod.size() - 1; ++i) { + out_lod.push_back(lod[i + 1] - lod[i]); + } - dim3 block_size(16, 32, element_len); - dim3 grid_size(10, 10); + int thread_x = std::max(static_cast(element_len), 32); + int block_x = static_cast(out_lod.size()); + dim3 block_size(thread_x, 1024 / thread_x); + dim3 grid_size(block_x, 1); sequence_expand_kernel<<>>( x.data(), out->mutable_data(context.GetPlace()), - out_starts.CUDAData(context.GetPlace()), out_starts.size(), - element_len); + out_lod.CUDAData(context.GetPlace()), lod.CUDAData(context.GetPlace()), + out_lod.size(), element_len, framework::product(x_dims)); } }; @@ -91,16 +123,24 @@ struct SequenceExpandGradFunctor { const LoDTensor& dout, LoDTensor* dx) { auto x_dims = x.dims(); size_t element_len = framework::product(x_dims) / x_dims[0]; - auto out_starts = out.lod().back(); + auto lod = out.lod().back(); + framework::Vector out_lod; + for (size_t i = 0; i < lod.size() - 1; ++i) { + out_lod.push_back(lod[i + 1] - lod[i]); + } + size_t 
dout_size = framework::product(dout.dims()); + size_t dx_size = framework::product(dx->dims()); - dim3 block_size(16, 32, element_len); - dim3 grid_size(10, 10); - size_t out_size = framework::product(dx->dims()); - sequence_expand_grad_kernel<<(element_len), 32); + dim3 block_size(thread_x, 1024 / thread_x); + int block_x = static_cast(out_lod.size()); + dim3 grid_size(block_x, 1); + sequence_expand_grad_kernel<<>>( dout.data(), dx->mutable_data(context.GetPlace()), - out_starts.CUDAData(context.GetPlace()), out_starts.size(), element_len, - out_size); + out_lod.CUDAData(context.GetPlace()), lod.CUDAData(context.GetPlace()), + out_lod.size(), element_len, dout_size, dx_size); } }; diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index 8393f7827b1c7..555f188abb9b8 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -362,6 +362,9 @@ def __assert_is_close(self, numeric_grads, analytic_grads, names, for a, b, name in itertools.izip(numeric_grads, analytic_grads, names): abs_a = np.abs(a) abs_a[abs_a < 1e-3] = 1 + print("actual", a) + print("*****") + print("expected", b) diff_mat = np.abs(a - b) / abs_a max_diff = np.max(diff_mat) diff --git a/python/paddle/fluid/tests/unittests/test_sequence_expand.py b/python/paddle/fluid/tests/unittests/test_sequence_expand.py index 957fa5d2c4a79..f984127b4d64f 100644 --- a/python/paddle/fluid/tests/unittests/test_sequence_expand.py +++ b/python/paddle/fluid/tests/unittests/test_sequence_expand.py @@ -19,8 +19,14 @@ class TestSequenceExpand(OpTest): def set_data(self): - x_data = np.random.uniform(0.1, 1, [3, 1]).astype('float32') - y_data = np.random.uniform(0.1, 1, [8, 1]).astype('float32') + x = [i / 10.0 for i in range(3)] + y = [i / 10.0 for i in range(8)] + x_data = np.array(x).reshape(3, 1).astype('float32') + y_data = np.array(y).reshape(8, 1).astype('float32') + print(x_data) + print(y_data) + # x_data = np.random.uniform(0.1, 1, [3, 1]).astype('float32') + # y_data = np.random.uniform(0.1, 1, [8, 1]).astype('float32') y_lod = [[0, 1, 4, 8]] self.inputs = {'X': x_data, 'Y': (y_data, y_lod)} @@ -45,47 +51,43 @@ def test_check_output(self): def test_check_grad(self): self.check_grad(["X"], "Out") - -class TestSequenceExpandCase1(TestSequenceExpand): - def set_data(self): - x_data = np.random.uniform(0.1, 1, [5, 1]).astype('float32') - x_lod = [[0, 2, 5]] - y_data = np.random.uniform(0.1, 1, [13, 1]).astype('float32') - y_lod = [[0, 2, 5], [0, 2, 4, 7, 10, 13]] - self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)} - - -class TestSequenceExpandCase2(TestSequenceExpand): - def set_data(self): - x_data = np.random.uniform(0.1, 1, [1, 2, 2]).astype('float32') - x_lod = [[0, 1]] - y_data = np.random.uniform(0.1, 1, [2, 2, 2]).astype('float32') - y_lod = [[0, 2]] - self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)} - - -class TestSequenceExpandCase3(TestSequenceExpand): - def set_data(self): - x_data = np.random.uniform(0.1, 1, [4, 1]).astype('float32') - x_lod = [[0, 1, 2, 3, 4]] - y_data = np.random.uniform(0.1, 1, [6, 1]).astype('float32') - y_lod = [[0, 2, 4, 4, 6]] - self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)} - - -class TestSequenceExpandCase4(TestSequenceExpand): - def set_data(self): - x_data = np.array( - [0.1, 0.3, 0.2, 0.15, 0.25, 0.2, 0.15, 0.25, 0.1, 0.3]).reshape( - [2, 5]).astype('float32') - x_lod = [[ - 0, - 1, - 2, - ]] - y_data = np.random.uniform(0.1, 1, [2, 1]).astype('float32') - 
y_lod = [[0, 1, 2], [0, 1, 2]] - self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)} + # class TestSequenceExpandCase1(TestSequenceExpand): + # def set_data(self): + # x_data = np.random.uniform(0.1, 1, [5, 1]).astype('float32') + # x_lod = [[0, 2, 5]] + # y_data = np.random.uniform(0.1, 1, [13, 1]).astype('float32') + # y_lod = [[0, 2, 5], [0, 2, 4, 7, 10, 13]] + # self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)} + + # class TestSequenceExpandCase2(TestSequenceExpand): + # def set_data(self): + # x_data = np.random.uniform(0.1, 1, [1, 2, 2]).astype('float32') + # x_lod = [[0, 1]] + # y_data = np.random.uniform(0.1, 1, [2, 2, 2]).astype('float32') + # y_lod = [[0, 2]] + # self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)} + + # class TestSequenceExpandCase3(TestSequenceExpand): + # def set_data(self): + # x_data = np.random.uniform(0.1, 1, [4, 1]).astype('float32') + # x_lod = [[0, 1, 2, 3, 4]] + # y_data = np.random.uniform(0.1, 1, [6, 1]).astype('float32') + # y_lod = [[0, 2, 4, 4, 6]] + # self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)} + + # class TestSequenceExpandCase4(TestSequenceExpand): + # def set_data(self): + # x_data = np.array( + # [0.1, 0.3, 0.2, 0.15, 0.25, 0.2, 0.15, 0.25, 0.1, 0.3]).reshape( + # [2, 5]).astype('float32') + # x_lod = [[ + # 0, + # 1, + # 2, + # ]] + # y_data = np.random.uniform(0.1, 1, [2, 1]).astype('float32') + # y_lod = [[0, 1, 2], [0, 1, 2]] + # self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)} if __name__ == '__main__': From db1b128feb63a14514c2e38e344f6b464e1b7a68 Mon Sep 17 00:00:00 2001 From: dzhwinter Date: Mon, 26 Mar 2018 20:16:57 -0700 Subject: [PATCH 05/12] "add details" --- paddle/fluid/operators/sequence_expand_op.h | 161 ++++++++++++++------ 1 file changed, 114 insertions(+), 47 deletions(-) diff --git a/paddle/fluid/operators/sequence_expand_op.h b/paddle/fluid/operators/sequence_expand_op.h index 11890b30ae598..5cab367988dd2 100644 --- a/paddle/fluid/operators/sequence_expand_op.h +++ b/paddle/fluid/operators/sequence_expand_op.h @@ -13,15 +13,19 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once +#include // std::itoa #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/memory/memcpy.h" -#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/operators/math/math_function.h" namespace paddle { namespace operators { using LoDTensor = framework::LoDTensor; +template +using EigenMatrix = framework::EigenMatrix; template struct SequenceExpandFunctor { @@ -38,23 +42,35 @@ template struct SequenceExpandFunctor { void operator()(const platform::CPUDeviceContext& context, const LoDTensor& x, LoDTensor* out) { - auto x_dims = x.dims(); - size_t element_len = framework::product(x_dims) / x_dims[0]; - const T* x_data = x.data(); - T* out_data = out->mutable_data(context.GetPlace()); - auto out_starts = out->lod().back(); - - for (size_t i = 0; i < out_starts.size() - 1; i++) { - int scale = out_starts[i + 1] - out_starts[i]; - Eigen::TensorMap< - Eigen::Tensor> - x_t(x_data, 1, element_len); - Eigen::TensorMap> - out_t(out_data, scale, element_len); - Eigen::array cast({{scale, 1}}); - out_t.device(*context.eigen_device()) = x_t.broadcast(cast); - x_data += element_len; - out_data += element_len * scale; + auto& out_lod = out->lod()[0]; + framework::Vector x_lod; + if (x.lod() == 1) { + x_lod = x.lod()[0]; + } else { + x_lod.reserve(out_lod.size()); + std::itoa(x_lod.begin(), x_lod.end(), 0); // fill 0 ~ out_lod.size()-1 + } + int out_offset = 0; + auto& eigen_place = *context.eigen_device(); + for (size_t i = 1; i < out_lod.size(); ++i) { + int repeat_num = y_lod[ref_level][i] - y_lod[ref_level][i - 1]; + int x_start = x_lod[i - 1]; + int x_end = x_lod[i]; + int x_seq_len = x_end - x_start; + if (repeat_num > 0) { + auto x_sub_tensor = x->Slice(x_start, x_end); + x_sub_tensor.Resize({1, x_sub_tensor.numel()}); + int out_start = out_offset; + if (x_lod.size() == 1) { + out_start = out_lod[0][out_offset]; + } + auto out_sub_tensor = + out->Slice(out_start, out_start + x_seq_len * repeat_num); + out_sub_tensor.Resize({repeat_num, x_sub_tensor.dims()[1]}); + EigenMatrix::From(out_sub_tensor).device(eigen_place) = + EigenMatrix::From(x_sub_tensor) + .broadcast(Eigen::array({{repeat_num, 1}})); + } } } }; @@ -64,15 +80,42 @@ class SequenceExpandKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { auto* x = context.Input("X"); - auto* out = context.Output("Out"); - auto x_dims = x->dims(); auto* y = context.Input("Y"); - PADDLE_ENFORCE(!y->lod().empty(), "y should have lod"); - PADDLE_ENFORCE_EQ(static_cast(x_dims[0]), - y->lod().back().size() - 1, - "The size of last lod level in Input(Y)" - "must be equal to dims[0] of Input(X)."); - out->set_lod(y->lod()); + auto* out = context.Output("Out"); + + int ref_level = context.Attr("ref_level"); + auto& x_lod = x->lod(); + auto& y_lod = y->lod(); + + if (ref_level == -1) ref_level = y_lod.size() - 1; + + out->mutable_data(context.GetPlace()); + + if (y_lod[ref_level].size() <= 1) { + framework::TensorCopy(*x, context.GetPlace(), out); + return; + } + + auto& out_lod = *out->mutable_lod(); + // x lod level is at most 1. 
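+    // Worked example with assumed values (not taken from this patch's tests):
+    // if x_lod[0] = {0, 2, 5} and y_lod[ref_level] = {0, 3, 5}, sequence 0 of
+    // X (length 2) is repeated 3 times and sequence 1 (length 3) twice, so
+    // the x_lod.size() == 1 branch below builds
+    // out_lod[0] = {0, 2, 4, 6, 9, 12}.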
+ if (x_lod.size() == 0) { + out_lod = y_lod[ref_level]; + } else if (x_lod.size() == 1) { + out_lod.resize(1); + out_lod[0] = {0}; + int out_offset = 0; + for (size_t i = 1; i < y_lod[ref_level].size(); ++i) { + int repeat_num = y_lod[ref_level][i] - y_lod[ref_level][i - 1]; + int x_start = x_lod[0][i - 1]; + int x_end = x_lod[0][i]; + int x_seq_len = x_end - x_start; + for (int j = 0; j < repeat_num; ++j) { + out_lod[0].push_back(out_lod[0].back() + x_seq_len); + out_offset++; + } + } + } + SequenceExpandFunctor functor; functor(context.template device_context(), *x, out); } @@ -94,21 +137,31 @@ template struct SequenceExpandGradFunctor { void operator()(const platform::CPUDeviceContext& context, const LoDTensor& x, const LoDTensor& out, const LoDTensor& dout, LoDTensor* dx) { - auto out_last_level = out.lod().back(); - const T* d_out_data = dout.data(); - T* d_x_data = dx->mutable_data(context.GetPlace()); - size_t element_len = dout.numel() / dout.dims()[0]; - for (size_t i = 0; i < out_last_level.size() - 1; ++i) { - size_t repeat = out_last_level[i + 1] - out_last_level[i]; - Eigen::TensorMap< - Eigen::Tensor> - d_out_t(d_out_data, static_cast(repeat), element_len); - Eigen::TensorMap> - d_x_t(d_x_data, static_cast(element_len)); - d_x_t.device(*context.eigen_device()) = - d_out_t.sum(Eigen::array({{0}})); - d_out_data += (repeat * element_len); - d_x_data += element_len; + auto& dev_ctx = context.template device_context(); + + math::SetConstant set_zero; + set_zero(dev_ctx, g_x, static_cast(0)); + + int g_out_offset = 0; + for (size_t i = 1; i < y_lod[ref_level].size(); ++i) { + int repeat_num = y_lod[ref_level][i] - y_lod[ref_level][i - 1]; + if (repeat_num > 0) { + int x_start = i - 1; + int x_end = i; + if (x_lod.size() == 1) { + x_start = x_lod[0][i - 1]; + x_end = x_lod[0][i]; + } + int x_seq_len = x_end - x_start; + auto g_x_sub = g_x->Slice(x_start, x_end); + g_x_sub.Resize(flatten_to_1d(g_x_sub.dims())); + int g_out_end = g_out_offset + repeat_num * x_seq_len; + auto g_out_sub = g_out->Slice(g_out_offset, g_out_end); + g_out_sub.Resize({repeat_num, g_x_sub.dims()[0]}); + math::ColwiseSum col_sum; + col_sum(dev_ctx, g_out_sub, &g_x_sub); + g_out_offset += repeat_num * x_seq_len; + } } } }; @@ -117,15 +170,29 @@ template class SequenceExpandGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { + auto* g_out = context.Input(framework::GradVarName("Out")); auto* x = context.Input("X"); - auto* out = context.Input("Out"); - auto* d_out = context.Input(framework::GradVarName("Out")); + auto* y = context.Input("Y"); + auto* g_x = context.Output(framework::GradVarName("X")); + int ref_level = context.Attr("ref_level"); + + g_x->mutable_data(context.GetPlace()); + g_x->set_lod(x->lod()); + + auto& x_lod = x->lod(); + auto& y_lod = y->lod(); + + if (ref_level == -1) ref_level = y_lod.size() - 1; + + // just copy the gradient + if (y_lod[ref_level].size() <= 1) { + framework::TensorCopy(*g_out, context.GetPlace(), g_x); + return; + } - auto* d_x = context.Output(framework::GradVarName("X")); - d_x->set_lod(x->lod()); SequenceExpandGradFunctor functor; - functor(context.template device_context(), *x, *out, *d_out, - d_x); + functor(context.template device_context(), *x, *y, *g_out, + g_x); } }; From 0be1e09f2c703c1479259ab68b06cc4bd1cb5c43 Mon Sep 17 00:00:00 2001 From: dzhwinter Date: Wed, 28 Mar 2018 02:34:34 -0700 Subject: [PATCH 06/12] "fix ci" --- paddle/fluid/operators/sequence_expand_op.cc | 5 +- 
paddle/fluid/operators/sequence_expand_op.cu | 193 +++++++++--------- paddle/fluid/operators/sequence_expand_op.h | 130 ++++++------ .../tests/unittests/test_sequence_expand.py | 22 +- 4 files changed, 183 insertions(+), 167 deletions(-) diff --git a/paddle/fluid/operators/sequence_expand_op.cc b/paddle/fluid/operators/sequence_expand_op.cc index 786fe63e7580c..ae52849162ae4 100644 --- a/paddle/fluid/operators/sequence_expand_op.cc +++ b/paddle/fluid/operators/sequence_expand_op.cc @@ -84,12 +84,11 @@ class SequenceExpandOp : public framework::OperatorWithKernel { } } out_dims[0] = out_first_dim; - ctx->SetOutputDim("Out", out_dims); } else { out_dims[0] = -1; - ctx->SetOutputDim("Out", out_dims); - ctx->ShareLoD("X", /*->*/ "Out"); } + ctx->SetOutputDim("Out", out_dims); + ctx->ShareLoD("X", /*->*/ "Out"); } }; diff --git a/paddle/fluid/operators/sequence_expand_op.cu b/paddle/fluid/operators/sequence_expand_op.cu index 743e3bbc297c5..1bd73426522bc 100644 --- a/paddle/fluid/operators/sequence_expand_op.cu +++ b/paddle/fluid/operators/sequence_expand_op.cu @@ -24,123 +24,128 @@ namespace operators { using LoDTensor = framework::LoDTensor; template -__global__ void sequence_expand_kernel(const T* x_data, T* out_data, - const size_t* lod, - const size_t* out_offset, - size_t lod_size, size_t element_len, - size_t x_size) { - int bid_x = blockIdx.x; - if (bid_x > lod_size) return; - int repeats = lod[bid_x]; - int offset = out_offset[bid_x]; - for (int tid_y = threadIdx.y; tid_y < repeats; tid_y += blockDim.y) { - for (int tid_x = threadIdx.x; tid_x < element_len; tid_x += blockDim.x) { - out_data[(offset + tid_y) * element_len + tid_x] = - x_data[bid_x * element_len + tid_x]; +__global__ void sequence_expand_kernel(const T* x_data, const size_t* x_lod, + const size_t* ref_lod, + const size_t lod_size, + /* default=1, + the instance length*/ + const int x_item_length, T* out_data) { + constexpr int N = 1024; + __shared__ int mem[N]; + int offset = 0; + for (int i = 0; i < lod_size; ++i) { + mem[i] = offset; + if (i < lod_size - 1) { + offset += (ref_lod[i + 1] - ref_lod[i]) * (x_lod[i + 1] - x_lod[i]); } } -} + __syncthreads(); -template -__global__ void sequence_expand_grad_kernel(const T* dout_data, T* dx_data, - const size_t* lod, - const size_t* out_offset, - size_t lod_size, size_t element_len, - size_t dout_size, size_t dx_size) { - // reduce visit memory time. 
- // dout_shm = [0 - dout_size-1], dx_shm = [dout_size-1, dout_size + dx_size-1] - if (blockIdx.x == 0 && blockIdx.y == 0 && threadIdx.x == 0 && - threadIdx.y == 0) { - printf("lod_size=%ld, element_size=%ld, dout_size=%ld, dx_size=%ld\n", - lod_size, element_len, dout_size, dx_size); - } - extern __shared__ T shm[]; - T* dout_shm = shm; - T* dx_shm = &shm[dout_size]; - - // int idx = threadIdx.x + blockIdx.x * blockDim.x; - for (int idx = 0; idx < dout_size; ++idx) { - if (idx < dx_size) { - dx_shm[idx] = 0.0; - } - if (idx < dout_size) { - dout_shm[idx] = dout_data[idx]; + int bid = blockIdx.x; + if (bid >= lod_size - 1) return; + + int x_item_count = x_lod[bid + 1] - x_lod[bid]; + int repeats = ref_lod[bid + 1] - ref_lod[bid]; + int out_offset = mem[bid]; + int x_offset = x_lod[bid]; + for (int tid_z = threadIdx.z; tid_z < repeats; tid_z += blockDim.z) { + for (int tid_y = threadIdx.y; tid_y < x_item_count; tid_y += blockDim.y) { + for (int tid_x = threadIdx.x; tid_x < x_item_length; + tid_x += blockDim.x) { + out_data[(out_offset + tid_z * x_item_count + tid_y) * x_item_length + + tid_x] = x_data[(x_offset + tid_y) * x_item_length + tid_x]; + } } } +} - int bid_x = blockIdx.x; - if (bid_x > lod_size) return; - int repeats = lod[bid_x]; - int offset = out_offset[bid_x]; - if (threadIdx.x == 0) { - printf("repeats=%d, offset=%ld\n", repeats, offset); - } - for (int tid_y = threadIdx.y; tid_y < repeats; tid_y += blockDim.y) { - for (int tid_x = threadIdx.x; tid_x < element_len; tid_x += blockDim.x) { - T val = dout_shm[(offset + tid_y) * element_len + tid_x]; - platform::CudaAtomicAdd(&dx_shm[bid_x * element_len + tid_x], val); - int dx_idx = bid_x * element_len + tid_x; - int dout_idx = (offset + tid_y) * element_len + tid_x; - printf("dx_idx=%d, dout_idx=%d, dx_data=%f, dout_data=%f, val=%f \n", - dx_idx, dout_idx, dx_shm[dx_idx], dout_shm[dout_idx], val); +template +__global__ void sequence_expand_grad_kernel(const T* dout_data, + const size_t* ref_lod, + const size_t* dx_lod, + const size_t lod_size, + /* default=1, + the instance length*/ + const int x_item_length, + T* dx_data) { + // TODO(dzhwinter) : too many atomicAdd + // use shared memory to reduce memory visits + constexpr int N = 1024; + __shared__ int mem[N]; + int offset = 0; + for (int i = 0; i < lod_size; ++i) { + mem[i] = offset; + if (i < lod_size - 1) { + offset += (ref_lod[i + 1] - ref_lod[i]) * (dx_lod[i + 1] - dx_lod[i]); } } __syncthreads(); - // copy shared memory back to dx - for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < dx_size; - idx += blockDim.x * gridDim.x) { - dx_data[idx] = dx_shm[idx]; + + int bid = blockIdx.x; + if (bid >= lod_size - 1) return; + int x_item_count = dx_lod[bid + 1] - dx_lod[bid]; + int repeats = ref_lod[bid + 1] - ref_lod[bid]; + int out_offset = mem[bid]; + int x_offset = dx_lod[bid]; + + for (int tid_z = threadIdx.z; tid_z < repeats; tid_z += blockDim.z) { + for (int tid_y = threadIdx.y; tid_y < x_item_count; tid_y += blockDim.y) { + for (int tid_x = threadIdx.x; tid_x < x_item_length; + tid_x += blockDim.x) { + platform::CudaAtomicAdd( + &dx_data[(x_offset + tid_y) * x_item_length + tid_x], + dout_data[(out_offset + tid_z * x_item_count + tid_y) * + x_item_length + + tid_x]); + } + } } } template struct SequenceExpandFunctor { - void operator()(const platform::CUDADeviceContext& context, - const LoDTensor& x, LoDTensor* out) { - auto x_dims = x.dims(); - size_t element_len = framework::product(x_dims) / x_dims[0]; - auto lod = out->lod().back(); - framework::Vector 
out_lod; - for (size_t i = 0; i < lod.size() - 1; ++i) { - out_lod.push_back(lod[i + 1] - lod[i]); - } - - int thread_x = std::max(static_cast(element_len), 32); - int block_x = static_cast(out_lod.size()); - dim3 block_size(thread_x, 1024 / thread_x); + void operator()( + const platform::CUDADeviceContext& context, const LoDTensor& x, + const framework::Vector& x_lod, /*expand source lod*/ + const framework::Vector& ref_lod, /*expand referenced lod*/ + LoDTensor* out) { + int x_item_length = 1; + x_item_length = x.numel() / x.dims()[0]; + VLOG(0) << "x_item_length" << x_item_length; + int thread_x = std::max(static_cast(ref_lod.size()), 32); + int thread_y = std::max(1024 / thread_x, 16); + int thread_z = std::min(1024 / thread_x / thread_y, 16); + int block_x = static_cast(ref_lod.size()); + dim3 block_size(thread_x, thread_y, thread_z); dim3 grid_size(block_x, 1); + sequence_expand_kernel<<>>( - x.data(), out->mutable_data(context.GetPlace()), - out_lod.CUDAData(context.GetPlace()), lod.CUDAData(context.GetPlace()), - out_lod.size(), element_len, framework::product(x_dims)); + x.data(), x_lod.CUDAData(context.GetPlace()), + ref_lod.CUDAData(context.GetPlace()), x_lod.size(), x_item_length, + out->mutable_data(context.GetPlace())); } }; template struct SequenceExpandGradFunctor { void operator()(const platform::CUDADeviceContext& context, - const LoDTensor& x, const LoDTensor& out, - const LoDTensor& dout, LoDTensor* dx) { - auto x_dims = x.dims(); - size_t element_len = framework::product(x_dims) / x_dims[0]; - auto lod = out.lod().back(); - framework::Vector out_lod; - for (size_t i = 0; i < lod.size() - 1; ++i) { - out_lod.push_back(lod[i + 1] - lod[i]); - } - size_t dout_size = framework::product(dout.dims()); - size_t dx_size = framework::product(dx->dims()); - - int thread_x = std::max(static_cast(element_len), 32); - dim3 block_size(thread_x, 1024 / thread_x); - int block_x = static_cast(out_lod.size()); + const LoDTensor& dout, + const framework::Vector& x_lod, /*expand source lod*/ + const framework::Vector& ref_lod, /*expand based lod*/ + LoDTensor* dx) { + int x_item_length = 1; + x_item_length = framework::product(dx->dims()) / dx->dims()[0]; + + int thread_x = std::max(static_cast(ref_lod.size()), 32); + int thread_y = std::max(1024 / thread_x, 16); + int thread_z = std::min(1024 / thread_x / thread_y, 16); + int block_x = static_cast(ref_lod.size()); + dim3 block_size(thread_x, thread_y, thread_z); dim3 grid_size(block_x, 1); - sequence_expand_grad_kernel<<>>( - dout.data(), dx->mutable_data(context.GetPlace()), - out_lod.CUDAData(context.GetPlace()), lod.CUDAData(context.GetPlace()), - out_lod.size(), element_len, dout_size, dx_size); + sequence_expand_grad_kernel<<>>( + dout.data(), ref_lod.CUDAData(context.GetPlace()), + x_lod.CUDAData(context.GetPlace()), ref_lod.size(), x_item_length, + dx->mutable_data(context.GetPlace())); } }; diff --git a/paddle/fluid/operators/sequence_expand_op.h b/paddle/fluid/operators/sequence_expand_op.h index 5cab367988dd2..c55c3e215abdf 100644 --- a/paddle/fluid/operators/sequence_expand_op.h +++ b/paddle/fluid/operators/sequence_expand_op.h @@ -13,8 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once -#include // std::itoa +#include // std::iota +#include +#include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/operators/math/math_function.h" @@ -29,40 +31,42 @@ using EigenMatrix = framework::EigenMatrix; template struct SequenceExpandFunctor { - void operator()(const DeviceContext& ctx, const LoDTensor& x, LoDTensor* out); + void operator()( + const DeviceContext& ctx, const LoDTensor& x, + const framework::Vector& x_lod, /*expand source lod*/ + const framework::Vector& ref_lod, /*expand referenced lod*/ + LoDTensor* out); }; template struct SequenceExpandGradFunctor { - void operator()(const DeviceContext& ctx, const LoDTensor& x, - const LoDTensor& out, const LoDTensor& dout, LoDTensor* dx); + void operator()( + const DeviceContext& ctx, const LoDTensor& dout, + const framework::Vector& x_lod, /*expand source lod*/ + const framework::Vector& ref_lod, /*expand referenced lod*/ + LoDTensor* dx); }; template struct SequenceExpandFunctor { - void operator()(const platform::CPUDeviceContext& context, const LoDTensor& x, - LoDTensor* out) { - auto& out_lod = out->lod()[0]; - framework::Vector x_lod; - if (x.lod() == 1) { - x_lod = x.lod()[0]; - } else { - x_lod.reserve(out_lod.size()); - std::itoa(x_lod.begin(), x_lod.end(), 0); // fill 0 ~ out_lod.size()-1 - } + void operator()( + const platform::CPUDeviceContext& context, const LoDTensor& x, + const framework::Vector& x_lod, /*expand source lod*/ + const framework::Vector& ref_lod, /*expand referenced lod*/ + LoDTensor* out) { int out_offset = 0; auto& eigen_place = *context.eigen_device(); - for (size_t i = 1; i < out_lod.size(); ++i) { - int repeat_num = y_lod[ref_level][i] - y_lod[ref_level][i - 1]; + for (size_t i = 1; i < ref_lod.size(); ++i) { + int repeat_num = ref_lod[i] - ref_lod[i - 1]; int x_start = x_lod[i - 1]; int x_end = x_lod[i]; int x_seq_len = x_end - x_start; if (repeat_num > 0) { - auto x_sub_tensor = x->Slice(x_start, x_end); + auto x_sub_tensor = x.Slice(x_start, x_end); x_sub_tensor.Resize({1, x_sub_tensor.numel()}); int out_start = out_offset; - if (x_lod.size() == 1) { - out_start = out_lod[0][out_offset]; + if (out->lod().size() == 1) { + out_start = out->lod()[0][out_offset]; } auto out_sub_tensor = out->Slice(out_start, out_start + x_seq_len * repeat_num); @@ -71,6 +75,7 @@ struct SequenceExpandFunctor { EigenMatrix::From(x_sub_tensor) .broadcast(Eigen::array({{repeat_num, 1}})); } + out_offset += repeat_num; } } }; @@ -96,13 +101,10 @@ class SequenceExpandKernel : public framework::OpKernel { return; } - auto& out_lod = *out->mutable_lod(); // x lod level is at most 1. 
- if (x_lod.size() == 0) { - out_lod = y_lod[ref_level]; - } else if (x_lod.size() == 1) { - out_lod.resize(1); - out_lod[0] = {0}; + framework::Vector out_lod; + if (x_lod.size() == 1) { + out_lod.push_back(0); int out_offset = 0; for (size_t i = 1; i < y_lod[ref_level].size(); ++i) { int repeat_num = y_lod[ref_level][i] - y_lod[ref_level][i - 1]; @@ -110,14 +112,25 @@ class SequenceExpandKernel : public framework::OpKernel { int x_end = x_lod[0][i]; int x_seq_len = x_end - x_start; for (int j = 0; j < repeat_num; ++j) { - out_lod[0].push_back(out_lod[0].back() + x_seq_len); + out_lod.push_back(out_lod.back() + x_seq_len); out_offset++; } } + // write lod to out if x has lod + auto& ref_lod = *out->mutable_lod(); + ref_lod[0] = out_lod; + } + framework::Vector ref_x_lod; + if (x->lod().size() == 1) { + ref_x_lod = x->lod()[0]; + } else { + // x_lod doesn't has lod, use fake x lod, level = 0 + ref_x_lod.resize(x->dims()[0] + 1); + std::iota(ref_x_lod.begin(), ref_x_lod.end(), 0); } - SequenceExpandFunctor functor; - functor(context.template device_context(), *x, out); + functor(context.template device_context(), *x, ref_x_lod, + y_lod[ref_level], out); } }; @@ -135,32 +148,29 @@ class SequenceExpandKernel : public framework::OpKernel { * */ template struct SequenceExpandGradFunctor { - void operator()(const platform::CPUDeviceContext& context, const LoDTensor& x, - const LoDTensor& out, const LoDTensor& dout, LoDTensor* dx) { - auto& dev_ctx = context.template device_context(); - - math::SetConstant set_zero; - set_zero(dev_ctx, g_x, static_cast(0)); - - int g_out_offset = 0; - for (size_t i = 1; i < y_lod[ref_level].size(); ++i) { - int repeat_num = y_lod[ref_level][i] - y_lod[ref_level][i - 1]; + void operator()( + const platform::CPUDeviceContext& context, const LoDTensor& dout, + const framework::Vector& x_lod, /*expand source lod*/ + const framework::Vector& ref_lod, /*expand referenced lod*/ + LoDTensor* dx) { + math::SetConstant set_zero; + set_zero(context, dx, static_cast(0)); + + int dout_offset = 0; + for (size_t i = 1; i < ref_lod.size(); ++i) { + int repeat_num = ref_lod[i] - ref_lod[i - 1]; if (repeat_num > 0) { - int x_start = i - 1; - int x_end = i; - if (x_lod.size() == 1) { - x_start = x_lod[0][i - 1]; - x_end = x_lod[0][i]; - } + int x_start = x_lod[i - 1]; + int x_end = x_lod[i]; int x_seq_len = x_end - x_start; - auto g_x_sub = g_x->Slice(x_start, x_end); - g_x_sub.Resize(flatten_to_1d(g_x_sub.dims())); - int g_out_end = g_out_offset + repeat_num * x_seq_len; - auto g_out_sub = g_out->Slice(g_out_offset, g_out_end); - g_out_sub.Resize({repeat_num, g_x_sub.dims()[0]}); - math::ColwiseSum col_sum; - col_sum(dev_ctx, g_out_sub, &g_x_sub); - g_out_offset += repeat_num * x_seq_len; + auto dx_sub = dx->Slice(x_start, x_end); + dx_sub.Resize(flatten_to_1d(dx_sub.dims())); + int dout_end = dout_offset + repeat_num * x_seq_len; + auto dout_sub = dout.Slice(dout_offset, dout_end); + dout_sub.Resize({repeat_num, dx_sub.dims()[0]}); + math::ColwiseSum col_sum; + col_sum(context, dout_sub, &dx_sub); + dout_offset += repeat_num * x_seq_len; } } } @@ -179,20 +189,26 @@ class SequenceExpandGradKernel : public framework::OpKernel { g_x->mutable_data(context.GetPlace()); g_x->set_lod(x->lod()); - auto& x_lod = x->lod(); auto& y_lod = y->lod(); - if (ref_level == -1) ref_level = y_lod.size() - 1; - // just copy the gradient if (y_lod[ref_level].size() <= 1) { framework::TensorCopy(*g_out, context.GetPlace(), g_x); return; } + framework::Vector ref_x_lod; + framework::Vector ref_lod = 
y_lod[ref_level]; + if (x->lod().size() == 1) { + ref_x_lod = x->lod()[0]; + } else { + // x_lod doesn't has lod, use fake x lod, level = 0 + ref_x_lod.resize(x->dims()[0] + 1); + std::iota(ref_x_lod.begin(), ref_x_lod.end(), 0); + } SequenceExpandGradFunctor functor; - functor(context.template device_context(), *x, *y, *g_out, - g_x); + functor(context.template device_context(), *g_out, ref_x_lod, + ref_lod, g_x); } }; diff --git a/python/paddle/fluid/tests/unittests/test_sequence_expand.py b/python/paddle/fluid/tests/unittests/test_sequence_expand.py index d1cebc4ea2df9..4c8ec1426c6e1 100644 --- a/python/paddle/fluid/tests/unittests/test_sequence_expand.py +++ b/python/paddle/fluid/tests/unittests/test_sequence_expand.py @@ -19,14 +19,8 @@ class TestSequenceExpand(OpTest): def set_data(self): - x = [i / 10.0 for i in range(3)] - y = [i / 10.0 for i in range(8)] - x_data = np.array(x).reshape(3, 1).astype('float32') - y_data = np.array(y).reshape(8, 1).astype('float32') - print(x_data) - print(y_data) - # x_data = np.random.uniform(0.1, 1, [3, 1]).astype('float32') - # y_data = np.random.uniform(0.1, 1, [8, 1]).astype('float32') + x_data = np.random.uniform(0.1, 1, [3, 1]).astype('float32') + y_data = np.random.uniform(0.1, 1, [8, 1]).astype('float32') y_lod = [[0, 1, 4, 8]] self.inputs = {'X': x_data, 'Y': (y_data, y_lod)} @@ -53,8 +47,10 @@ def compute(self): x_len = x_idx[i] - x_idx[i - 1] if repeat_num > 0: x_sub = x_data[x_idx[i - 1]:x_idx[i], :] - x_sub = np.repeat(x_sub, repeat_num, axis=0) - out = np.vstack((out, x_sub)) + stacked_x_sub = x_sub + for r in range(repeat_num - 1): + stacked_x_sub = np.vstack((stacked_x_sub, x_sub)) + out = np.vstack((out, stacked_x_sub)) if x_lod is not None: for j in xrange(repeat_num): out_lod[0].append(out_lod[0][-1] + x_len) @@ -107,11 +103,11 @@ def set_data(self): class TestSequenceExpandCase4(TestSequenceExpand): def set_data(self): - data = [0.1, 0.3, 0.2, 0.15, 0.25, 0.2, 0.15, 0.25, 0.1, 0.3] + data = np.random.uniform(0.1, 1, [5 * 2, 1]) x_data = np.array(data).reshape([5, 2]).astype('float32') x_lod = [[0, 2, 5]] - y_data = np.random.uniform(0.1, 1, [2, 1]).astype('float32') - y_lod = [[0, 1, 2], [0, 1, 2]] + y_data = np.random.uniform(0.1, 1, [3, 1]).astype('float32') + y_lod = [[0, 1, 3], [0, 1, 3]] self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)} From 0412f5e09b9c1d13593a28b6a529affb26681141 Mon Sep 17 00:00:00 2001 From: dzhwinter Date: Wed, 28 Mar 2018 05:21:29 -0700 Subject: [PATCH 07/12] "fix ci" --- paddle/fluid/operators/sequence_expand_op.cu | 21 ++++++++------------ paddle/fluid/operators/sequence_expand_op.h | 2 -- 2 files changed, 8 insertions(+), 15 deletions(-) diff --git a/paddle/fluid/operators/sequence_expand_op.cu b/paddle/fluid/operators/sequence_expand_op.cu index 1bd73426522bc..8a35bc908e8d7 100644 --- a/paddle/fluid/operators/sequence_expand_op.cu +++ b/paddle/fluid/operators/sequence_expand_op.cu @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #define EIGEN_USE_GPU -#include #include #include "paddle/fluid/operators/sequence_expand_op.h" #include "paddle/fluid/platform/cuda_helper.h" @@ -109,12 +108,10 @@ struct SequenceExpandFunctor { const framework::Vector& x_lod, /*expand source lod*/ const framework::Vector& ref_lod, /*expand referenced lod*/ LoDTensor* out) { - int x_item_length = 1; - x_item_length = x.numel() / x.dims()[0]; - VLOG(0) << "x_item_length" << x_item_length; - int thread_x = std::max(static_cast(ref_lod.size()), 32); - int thread_y = std::max(1024 / thread_x, 16); - int thread_z = std::min(1024 / thread_x / thread_y, 16); + int x_item_length = x.numel() / x.dims()[0]; + int thread_x = std::min(32, std::max(static_cast(ref_lod.size()), 16)); + int thread_y = 16; + int thread_z = 1024 / thread_x / thread_y; int block_x = static_cast(ref_lod.size()); dim3 block_size(thread_x, thread_y, thread_z); dim3 grid_size(block_x, 1); @@ -133,12 +130,10 @@ struct SequenceExpandGradFunctor { const framework::Vector& x_lod, /*expand source lod*/ const framework::Vector& ref_lod, /*expand based lod*/ LoDTensor* dx) { - int x_item_length = 1; - x_item_length = framework::product(dx->dims()) / dx->dims()[0]; - - int thread_x = std::max(static_cast(ref_lod.size()), 32); - int thread_y = std::max(1024 / thread_x, 16); - int thread_z = std::min(1024 / thread_x / thread_y, 16); + int x_item_length = framework::product(dx->dims()) / dx->dims()[0]; + int thread_x = std::min(32, std::max(static_cast(ref_lod.size()), 16)); + int thread_y = 16; + int thread_z = 1024 / thread_x / thread_y; int block_x = static_cast(ref_lod.size()); dim3 block_size(thread_x, thread_y, thread_z); dim3 grid_size(block_x, 1); diff --git a/paddle/fluid/operators/sequence_expand_op.h b/paddle/fluid/operators/sequence_expand_op.h index c55c3e215abdf..d62c387c3eebf 100644 --- a/paddle/fluid/operators/sequence_expand_op.h +++ b/paddle/fluid/operators/sequence_expand_op.h @@ -15,8 +15,6 @@ limitations under the License. 
From b661fe1d76514127581f2f73b177d2891677d39f Mon Sep 17 00:00:00 2001
From: dzhwinter
Date: Thu, 29 Mar 2018 01:36:34 -0700
Subject: [PATCH 08/12] "fix ci"

---
 python/paddle/fluid/tests/unittests/op_test.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py
index 555f188abb9b8..8393f7827b1c7 100644
--- a/python/paddle/fluid/tests/unittests/op_test.py
+++ b/python/paddle/fluid/tests/unittests/op_test.py
@@ -362,9 +362,6 @@ def __assert_is_close(self, numeric_grads, analytic_grads, names,
         for a, b, name in itertools.izip(numeric_grads, analytic_grads, names):
             abs_a = np.abs(a)
             abs_a[abs_a < 1e-3] = 1
-            print("actual", a)
-            print("*****")
-            print("expected", b)
             diff_mat = np.abs(a - b) / abs_a
             max_diff = np.max(diff_mat)

From fbdb5b7b437a55ce97fba37da5fdcbdd5e3e53bb Mon Sep 17 00:00:00 2001
From: dzhwinter
Date: Thu, 29 Mar 2018 19:20:50 -0700
Subject: [PATCH 09/12] "fix based on comment"

---
 paddle/fluid/operators/sequence_expand_op.cu | 68 +++++++++-----------
 1 file changed, 32 insertions(+), 36 deletions(-)

diff --git a/paddle/fluid/operators/sequence_expand_op.cu b/paddle/fluid/operators/sequence_expand_op.cu
index 8a35bc908e8d7..8119afce1a482 100644
--- a/paddle/fluid/operators/sequence_expand_op.cu
+++ b/paddle/fluid/operators/sequence_expand_op.cu
@@ -25,27 +25,17 @@ using LoDTensor = framework::LoDTensor;
 template <typename T>
 __global__ void sequence_expand_kernel(const T* x_data, const size_t* x_lod,
                                        const size_t* ref_lod,
+                                       const size_t* offset,
                                        const size_t lod_size,
                                        /* default=1,
                                           the instance length*/
                                        const int x_item_length, T* out_data) {
-  constexpr int N = 1024;
-  __shared__ int mem[N];
-  int offset = 0;
-  for (int i = 0; i < lod_size; ++i) {
-    mem[i] = offset;
-    if (i < lod_size - 1) {
-      offset += (ref_lod[i + 1] - ref_lod[i]) * (x_lod[i + 1] - x_lod[i]);
-    }
-  }
-  __syncthreads();
-
   int bid = blockIdx.x;
   if (bid >= lod_size - 1) return;
   int x_item_count = x_lod[bid + 1] - x_lod[bid];
   int repeats = ref_lod[bid + 1] - ref_lod[bid];
-  int out_offset = mem[bid];
+  int out_offset = static_cast<int>(offset[bid]);
   int x_offset = x_lod[bid];
   for (int tid_z = threadIdx.z; tid_z < repeats; tid_z += blockDim.z) {
     for (int tid_y = threadIdx.y; tid_y < x_item_count; tid_y += blockDim.y) {
@@ -59,32 +49,17 @@ __global__ void sequence_expand_kernel(const T* x_data, const size_t* x_lod,
 }
 
 template <typename T>
-__global__ void sequence_expand_grad_kernel(const T* dout_data,
-                                            const size_t* ref_lod,
-                                            const size_t* dx_lod,
-                                            const size_t lod_size,
-                                            /* default=1,
-                                               the instance length*/
-                                            const int x_item_length,
-                                            T* dx_data) {
-  // TODO(dzhwinter) : too many atomicAdd
-  // use shared memory to reduce memory visits
-  constexpr int N = 1024;
-  __shared__ int mem[N];
-  int offset = 0;
-  for (int i = 0; i < lod_size; ++i) {
-    mem[i] = offset;
-    if (i < lod_size - 1) {
-      offset += (ref_lod[i + 1] - ref_lod[i]) * (dx_lod[i + 1] - dx_lod[i]);
-    }
-  }
-  __syncthreads();
-
+__global__ void sequence_expand_grad_kernel(
+    const T* dout_data, const size_t* ref_lod, const size_t* dx_lod,
+    const size_t* offset, const size_t lod_size,
+    /* default=1, the instance length*/
+    const int x_item_length, T* dx_data) {
   int bid = blockIdx.x;
   if (bid >= lod_size - 1) return;
   int x_item_count = dx_lod[bid + 1] - dx_lod[bid];
   int repeats = ref_lod[bid + 1] - ref_lod[bid];
-  int out_offset = mem[bid];
+  int out_offset = static_cast<int>(offset[bid]);
   int x_offset = dx_lod[bid];
 
   for (int tid_z = threadIdx.z; tid_z < repeats; tid_z += blockDim.z) {
@@ -101,6 +76,19 @@ __global__ void sequence_expand_grad_kernel(const T* dout_data,
   }
 }
 
+void GetOutputOffset(const framework::Vector<size_t>& x_lod,
+                     const framework::Vector<size_t>& ref_lod,
+                     framework::Vector<size_t>& out_offset) {
+  size_t offset = 0;
+  int lod_size = static_cast<int>(x_lod.size());
+  for (int i = 0; i < static_cast<int>(x_lod.size()); ++i) {
+    out_offset[i] = offset;
+    if (i < lod_size - 1) {
+      offset += (ref_lod[i + 1] - ref_lod[i]) * (x_lod[i + 1] - x_lod[i]);
+    }
+  }
+}
+
 template <typename T>
 struct SequenceExpandFunctor<platform::CUDADeviceContext, T> {
   void operator()(
@@ -109,6 +97,9 @@ struct SequenceExpandFunctor<platform::CUDADeviceContext, T> {
       const framework::Vector<size_t>& ref_lod, /*expand referenced lod*/
       LoDTensor* out) {
     int x_item_length = x.numel() / x.dims()[0];
+    framework::Vector<size_t> out_offset(x_lod.size());
+    GetOutputOffset(x_lod, ref_lod, out_offset);
+
     int thread_x = std::min(32, std::max(static_cast<int>(ref_lod.size()), 16));
     int thread_y = 16;
     int thread_z = 1024 / thread_x / thread_y;
@@ -118,7 +109,8 @@ struct SequenceExpandFunctor<platform::CUDADeviceContext, T> {
     sequence_expand_kernel<<<grid_size, block_size, 0, context.stream()>>>(
         x.data<T>(), x_lod.CUDAData(context.GetPlace()),
-        ref_lod.CUDAData(context.GetPlace()), x_lod.size(), x_item_length,
+        ref_lod.CUDAData(context.GetPlace()),
+        out_offset.CUDAData(context.GetPlace()), x_lod.size(), x_item_length,
         out->mutable_data<T>(context.GetPlace()));
   }
 };
@@ -131,6 +123,9 @@ struct SequenceExpandGradFunctor<platform::CUDADeviceContext, T> {
      const framework::Vector<size_t>& ref_lod, /*expand based lod*/
      LoDTensor* dx) {
    int x_item_length = framework::product(dx->dims()) / dx->dims()[0];
+    framework::Vector<size_t> out_offset(x_lod.size());
+    GetOutputOffset(x_lod, ref_lod, out_offset);
+
    int thread_x = std::min(32, std::max(static_cast<int>(ref_lod.size()), 16));
    int thread_y = 16;
    int thread_z = 1024 / thread_x / thread_y;
@@ -139,7 +134,8 @@ struct SequenceExpandGradFunctor<platform::CUDADeviceContext, T> {
     dim3 grid_size(block_x, 1);
     sequence_expand_grad_kernel<<<grid_size, block_size, 0, context.stream()>>>(
         dout.data<T>(), ref_lod.CUDAData(context.GetPlace()),
-        x_lod.CUDAData(context.GetPlace()), ref_lod.size(), x_item_length,
+        x_lod.CUDAData(context.GetPlace()),
+        out_offset.CUDAData(context.GetPlace()), ref_lod.size(), x_item_length,
         dx->mutable_data<T>(context.GetPlace()));
   }
 };
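
Note: GetOutputOffset above is an exclusive prefix sum over segments: offset[i] is the output row at which the expansion of X segment i starts, i.e. the sum of repeats * segment_rows over all earlier segments. An illustrative NumPy equivalent (a sketch, not part of the patch), fed with the LoDs from TestSequenceExpandCase4:

import numpy as np

def get_output_offset(x_lod, ref_lod):
    # offset[i] = sum over j < i of (ref_lod[j+1] - ref_lod[j]) * (x_lod[j+1] - x_lod[j])
    sizes = np.diff(np.asarray(x_lod)) * np.diff(np.asarray(ref_lod))
    return np.concatenate(([0], np.cumsum(sizes)))

# X segments [0,2) and [2,5) repeated 1 and 2 times respectively.
print(get_output_offset([0, 2, 5], [0, 1, 3]))  # [0 2 8]: starts at rows 0 and 2, 8 rows total
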
From c72450d24d49c547d8e6bfc75691f429c19d6a79 Mon Sep 17 00:00:00 2001
From: dzhwinter
Date: Mon, 2 Apr 2018 23:46:34 -0700
Subject: [PATCH 10/12] "seperate test"

---
 python/paddle/fluid/tests/unittests/CMakeLists.txt | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index 0ad273c716197..3873fda22665f 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -28,6 +28,8 @@ function(py_test_modules TARGET_NAME)
   endif()
 endfunction()
 
+list(REMOVE_ITEM TEST_OPS test_sequence_expand)
+
 # test time consuming OPs in a separate process for expliot parallism
 list(REMOVE_ITEM TEST_OPS test_warpctc_op)
 list(REMOVE_ITEM TEST_OPS test_dyn_rnn)
@@ -63,6 +65,8 @@ else()
     endforeach(TEST_OP)
 endif(WITH_FAST_BUNDLE_TEST)
 
+#
+py_test_modules(test_sequence_expand MODULES test_sequence_expand)
 # tests with high overhead
 py_test_modules(test_warpctc_op MODULES test_warpctc_op ENVS FLAGS_warpctc_dir=${WARPCTC_LIB_DIR})
 py_test_modules(test_train_dyn_rnn MODULES test_dyn_rnn)

From 80bd1ca01f62871b7e14fbdbe70482b3eeff9779 Mon Sep 17 00:00:00 2001
From: dzhwinter
Date: Wed, 11 Apr 2018 01:31:59 -0700
Subject: [PATCH 11/12] "fix the style"

---
 paddle/fluid/operators/sequence_expand_op.cu | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/paddle/fluid/operators/sequence_expand_op.cu b/paddle/fluid/operators/sequence_expand_op.cu
index 8119afce1a482..111ccba2255fe 100644
--- a/paddle/fluid/operators/sequence_expand_op.cu
+++ b/paddle/fluid/operators/sequence_expand_op.cu
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#define EIGEN_USE_GPU
 #include <algorithm>
 #include "paddle/fluid/operators/sequence_expand_op.h"
 #include "paddle/fluid/platform/cuda_helper.h"
@@ -78,7 +77,7 @@ __global__ void sequence_expand_grad_kernel(
 void GetOutputOffset(const framework::Vector<size_t>& x_lod,
                      const framework::Vector<size_t>& ref_lod,
-                     framework::Vector<size_t>& out_offset) {
+                     framework::Vector<size_t>* out_offset) {
   size_t offset = 0;
   int lod_size = static_cast<int>(x_lod.size());
   for (int i = 0; i < static_cast<int>(x_lod.size()); ++i) {
@@ -98,7 +97,7 @@ struct SequenceExpandFunctor<platform::CUDADeviceContext, T> {
       LoDTensor* out) {
     int x_item_length = x.numel() / x.dims()[0];
     framework::Vector<size_t> out_offset(x_lod.size());
-    GetOutputOffset(x_lod, ref_lod, out_offset);
+    GetOutputOffset(x_lod, ref_lod, &out_offset);
 
     int thread_x = std::min(32, std::max(static_cast<int>(ref_lod.size()), 16));
     int thread_y = 16;
@@ -124,7 +123,7 @@ struct SequenceExpandGradFunctor<platform::CUDADeviceContext, T> {
       LoDTensor* dx) {
     int x_item_length = framework::product(dx->dims()) / dx->dims()[0];
     framework::Vector<size_t> out_offset(x_lod.size());
-    GetOutputOffset(x_lod, ref_lod, out_offset);
+    GetOutputOffset(x_lod, ref_lod, &out_offset);
 
     int thread_x = std::min(32, std::max(static_cast<int>(ref_lod.size()), 16));
     int thread_y = 16;

From 62d1f9a7cb9b850584fcd22d1c2b57f31174a13a Mon Sep 17 00:00:00 2001
From: dzhwinter
Date: Wed, 11 Apr 2018 01:44:57 -0700
Subject: [PATCH 12/12] "done"

---
 paddle/fluid/operators/sequence_expand_op.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/fluid/operators/sequence_expand_op.cu b/paddle/fluid/operators/sequence_expand_op.cu
index 111ccba2255fe..c00765e5d59af 100644
--- a/paddle/fluid/operators/sequence_expand_op.cu
+++ b/paddle/fluid/operators/sequence_expand_op.cu
@@ -81,7 +81,7 @@ void GetOutputOffset(const framework::Vector<size_t>& x_lod,
   size_t offset = 0;
   int lod_size = static_cast<int>(x_lod.size());
   for (int i = 0; i < static_cast<int>(x_lod.size()); ++i) {
-    out_offset[i] = offset;
+    (*out_offset)[i] = offset;
     if (i < lod_size - 1) {
       offset += (ref_lod[i + 1] - ref_lod[i]) * (x_lod[i + 1] - x_lod[i]);
     }
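
Note: for reference, a serial NumPy sketch (illustrative only, not part of the patch series) of the index mapping the final kernel uses: blockIdx.x owns one source segment, threadIdx.z walks the repeats, threadIdx.y the rows of the segment, and threadIdx.x the features of a row.

import numpy as np

def expand_serial(x, x_lod, ref_lod):
    x_lod, ref_lod = np.asarray(x_lod), np.asarray(ref_lod)
    seg_rows, repeats = np.diff(x_lod), np.diff(ref_lod)
    # Same exclusive prefix sum that GetOutputOffset precomputes on the host.
    offset = np.concatenate(([0], np.cumsum(seg_rows * repeats)))
    out = np.empty((offset[-1], x.shape[1]), dtype=x.dtype)
    for bid in range(len(x_lod) - 1):            # blockIdx.x: one source segment
        for tid_z in range(repeats[bid]):        # threadIdx.z: repeats
            for tid_y in range(seg_rows[bid]):   # threadIdx.y: rows of the segment
                out_row = offset[bid] + tid_z * seg_rows[bid] + tid_y
                out[out_row] = x[x_lod[bid] + tid_y]  # threadIdx.x: features
    return out

x = np.arange(10, dtype='float32').reshape(5, 2)
print(expand_serial(x, [0, 2, 5], [0, 1, 3]).shape)  # (8, 2)
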