From 6cc4bd536f1c9862bca6e3104cab4b3daf843e1e Mon Sep 17 00:00:00 2001 From: typhoonzero Date: Tue, 26 Dec 2017 14:37:47 +0800 Subject: [PATCH 1/8] wip --- paddle/operators/adam_op.h | 120 ++++++++++++++++-- python/paddle/v2/fluid/tests/test_adam_op.py | 125 +++++++++++++++++++ 2 files changed, 232 insertions(+), 13 deletions(-) diff --git a/paddle/operators/adam_op.h b/paddle/operators/adam_op.h index c4e2c8bb88ec9..aa58c4f9908b5 100644 --- a/paddle/operators/adam_op.h +++ b/paddle/operators/adam_op.h @@ -79,6 +79,71 @@ struct AdamFunctor { } }; +template +struct SparseAdamFunctor { + T beta1_; + T beta2_; + T epsilon_; + + const T* beta1_pow_; + const T* beta2_pow_; + const T* moment1_; + T* moment1_out_; + const T* moment2_; + T* moment2_out_; + const T* lr_; + const T* grad_; + const T* param_; + T* param_out_; + + const int64_t* rows_; + int64_t row_numel_; + int64_t height_; + + SparseAdamFunctor(T beta1, T beta2, T epsilon, const T* beta1_pow, + const T* beta2_pow, const T* mom1, T* mom1_out, + const T* mom2, T* mom2_out, const T* lr, const T* grad, + const T* param, T* param_out, const int64_t* rows, + int64_t row_numel, int64_t height) + : beta1_(beta1), + beta2_(beta2), + epsilon_(epsilon), + beta1_pow_(beta1_pow), + beta2_pow_(beta2_pow), + moment1_(mom1), + moment1_out_(mom1_out), + moment2_(mom2), + moment2_out_(mom2_out), + lr_(lr), + grad_(grad), + param_(param), + param_out_(param_out), + rows_(rows), + row_numel_(row_numel), + height_(height) {} + + inline HOSTDEVICE void operator()(size_t i) const { + for (int64_t j = 0; j < row_numel_; ++j) { + T g = grad_[i * row_numel_ + j]; + T mom1 = moment1_[rows_[i] * row_numel_ + j]; + T mom2 = moment2_[rows_[i] * row_numel_ + j]; + T lr = *lr_; + T beta1_pow = *beta1_pow_; + T beta2_pow = *beta2_pow_; + T p = param_[rows_[i] * row_numel_ + j]; + + lr *= sqrt(1 - beta2_pow) / (1 - beta1_pow); + mom1 = beta1_ * mom1 + (1 - beta1_) * g; + mom2 = beta2_ * mom2 + (1 - beta2_) * g * g; + p -= lr * (mom1 / (sqrt(mom2) + epsilon_)); + // FIXME(typhoonzero): row id may be duplicate + moment1_out_[rows_[i] * row_numel_ + j] = mom1; + moment2_out_[rows_[i] * row_numel_ + j] = mom2; + param_out_[rows_[i] * row_numel_ + j] = p; + } // for col id + } +}; + template class AdamOpKernel : public framework::OpKernel { public: @@ -90,7 +155,8 @@ class AdamOpKernel : public framework::OpKernel { T beta2 = static_cast(ctx.Attr("beta2")); T epsilon = static_cast(ctx.Attr("epsilon")); auto& param = Ref(ctx.Input("Param"), "Must set Param"); - auto& grad = Ref(ctx.Input("Grad"), "Must set Grad"); + // auto& grad = Ref(ctx.Input("Grad"), "Must set Grad"); + auto* grad_var = ctx.InputVar("Grad"); auto& mom1 = Ref(ctx.Input("Moment1"), "Must set Moment1"); auto& mom2 = Ref(ctx.Input("Moment2"), "Must set Moment2"); auto& lr = @@ -108,18 +174,46 @@ class AdamOpKernel : public framework::OpKernel { auto& mom2_out = Ref(ctx.Output("Moment2Out"), "Must set Moment1Out"); - AdamFunctor functor(beta1, beta2, epsilon, beta1_pow.template data(), - beta2_pow.template data(), - mom1.template data(), - mom1_out.template mutable_data(ctx.GetPlace()), - mom2.template data(), - mom2_out.template mutable_data(ctx.GetPlace()), - lr.template data(), grad.template data(), - param.template data(), - param_out.template mutable_data(ctx.GetPlace())); - platform::ForRange for_range( - static_cast(ctx.device_context()), param.numel()); - for_range(functor); + if (grad_var->IsType()) { + auto& grad = Ref(ctx.Input("Grad"), "Must set Grad"); + AdamFunctor functor( + beta1, beta2, 
epsilon, beta1_pow.template data(), + beta2_pow.template data(), mom1.template data(), + mom1_out.template mutable_data(ctx.GetPlace()), + mom2.template data(), + mom2_out.template mutable_data(ctx.GetPlace()), + lr.template data(), grad.template data(), + param.template data(), + param_out.template mutable_data(ctx.GetPlace())); + platform::ForRange for_range( + static_cast(ctx.device_context()), + param.numel()); + for_range(functor); + } else if (grad_var->IsType()) { + auto& grad = + Ref(ctx.Input("Grad"), "Must set Grad"); + auto& grad_tensor = grad.value(); + const T* grad_data = grad_tensor.template data(); + auto* rows = grad.rows().data(); + auto height = grad.height(); + auto row_numel = grad_tensor.numel() / height; + + SparseAdamFunctor functor( + beta1, beta2, epsilon, beta1_pow.template data(), + beta2_pow.template data(), mom1.template data(), + mom1_out.template mutable_data(ctx.GetPlace()), + mom2.template data(), + mom2_out.template mutable_data(ctx.GetPlace()), + lr.template data(), grad_data, param.template data(), + param_out.template mutable_data(ctx.GetPlace()), rows, row_numel, + height); + platform::ForRange for_range( + static_cast(ctx.device_context()), + grad.rows().size()); + for_range(functor); + } else { + PADDLE_THROW("Variable type not supported by adam_op"); + } } }; diff --git a/python/paddle/v2/fluid/tests/test_adam_op.py b/python/paddle/v2/fluid/tests/test_adam_op.py index a0d6655d4cbcf..a66fd33102720 100644 --- a/python/paddle/v2/fluid/tests/test_adam_op.py +++ b/python/paddle/v2/fluid/tests/test_adam_op.py @@ -176,5 +176,130 @@ def adam_step(inputs, attributes): return param_out, moment1_out, moment2_out +def adam_step_sparse(inputs, attributes, height, rows, row_numel, np_grad): + ''' + Simulate one step of the adam optimizer + :param inputs: dict of inputs + :param attributes: dict of attributes + :return tuple: tuple of output param, moment1, moment2, + beta1 power accumulator and beta2 power accumulator + ''' + param = inputs['Param'] + # grad = inputs['Grad'] + moment1 = inputs['Moment1'] + moment2 = inputs['Moment2'] + lr = inputs['LearningRate'] + beta1_pow = inputs['Beta1Pow'] + beta2_pow = inputs['Beta2Pow'] + + beta1 = attributes['beta1'] + beta2 = attributes['beta2'] + epsilon = attributes['epsilon'] + + moment1_out = np.array([height, row_numel]) + moment2_out = np.array([height, row_numel]) + param_out = np.array([height, row_numel]) + + for idx, row_id in enumerate(rows): + moment1_out[row_id] = beta1 * moment1[row_id] + (1 - beta1 + ) * np_grad[idx] + moment2_out[row_id] = beta2 * moment2[row_id] + ( + 1 - beta2) * np.square(np_grad[idx]) + lr_t = lr * np.sqrt(1 - beta2_pow) / (1 - beta1_pow) + param_out[row_id] = param[row_id] - lr_t * (moment1_out / ( + np.sqrt(moment2_out) + epsilon)) + return param_out, moment1_out, moment2_out + + +class TestSparseAdamOp(unittest.TestCase): + def setup(self, scope, place): + beta1 = 0.78 + beta2 = 0.836 + epsilon = 1e-4 + + height = 10 + rows = [0, 4, 7] + row_numel = 12 + self.dense_inputs = { + "Param": np.full((height, row_numel), 5.0).astype("float32"), + "Moment1": np.full((height, row_numel), 5.0).astype("float32"), + "Moment2": np.full((height, row_numel), 5.0).astype("float32"), + 'Beta1Pow': np.array([0.9**10]).astype("float32"), + 'Beta2Pow': np.array([0.999**10]).astype("float32"), + "LearningRate": np.full((1), 2.0).astype("float32") + } + self.attrs = {'epsilon': epsilon, 'beta1': beta1, 'beta2': beta2} + + grad_selected_rows = scope.var('Grad').get_selected_rows() + 
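+        # The sparse gradient only covers rows [0, 4, 7] of the dense
+        # (height x row_numel) = (10 x 12) parameter, so its value tensor
+        # below has shape (len(rows), row_numel) = (3, 12).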
grad_selected_rows.set_height(height) + grad_selected_rows.set_rows(rows) + np_array = np.ones((len(rows), row_numel)).astype("float32") + np_array[0, 0] = 2.0 + np_array[2, 8] = 4.0 + + grad_tensor = grad_selected_rows.get_tensor() + grad_tensor.set(np_array, place) + + self.sparse_inputs = ["Grad"] + + param_out, mom1, mom2 = adam_step_sparse( + self.dense_inputs, self.attrs, height, rows, row_numel, np_array) + self.outputs = { + "Param": param_out, + "Moment1Out": mom1, + "Moment2Out": mom2 + } + + def check_with_place(self, place): + scope = core.Scope() + self.setup(scope, place) + + op_args = dict() + for key, np_array in self.dense_inputs.iteritems(): + var = scope.var(key).get_tensor() + var.set(np_array, place) + op_args[key] = key + for s in self.sparse_inputs: + op_args[s] = s + for k in self.attrs: + op_args[k] = self.attrs[k] + + # create and run sgd operator + sgd_op = Operator("adam", **op_args) + sgd_op.run(scope, place) + + for key, np_array in self.outputs.iteritems(): + out_var = scope.var(key).get_tensor() + actual = np.array(out_var) + actual.reshape([actual.size()]) + np_array.reshape([np_array.size()]) + i = 0 + while i < actual.size(): + self.assertAlmostEqual(actual[i], np_array[i]) + i += 1 + + # # rows[0] = 0, 5.0 - 2.0 * 2.0 + # self.assertAlmostEqual(1.0, result_array[rows[0], 0]) + # # rows[0] = 0, 5.0 - 2.0 * 1.0 + # self.assertAlmostEqual(3.0, result_array[rows[0], 2]) + # # 5.0 - 2.0 * 0.0 + # self.assertAlmostEqual(5.0, result_array[1, 0]) + # # rows[1] = 4, 5.0 - 2.0 * 1.0 + # self.assertAlmostEqual(3.0, result_array[rows[1], 10]) + # # 5.0 - 2.0 * 0.0 + # self.assertAlmostEqual(5.0, result_array[5, 8]) + # # rows[2] = 7, 5.0 - 2.0 * 1.0 + # self.assertAlmostEqual(3.0, result_array[rows[2], 1]) + # # rows[2] = 7, 5.0 - 2.0 * 4.0 + # self.assertAlmostEqual(-3.0, result_array[rows[2], 8]) + + def test_sparse_sgd(self): + places = [core.CPUPlace()] + if core.is_compile_gpu(): + places.append(core.CUDAPlace(0)) + for place in places: + self.check_with_place(place) + + if __name__ == "__main__": unittest.main() From 5361911c689e1368adc4c8b0c86ea44c310796dc Mon Sep 17 00:00:00 2001 From: typhoonzero Date: Tue, 26 Dec 2017 21:23:08 +0800 Subject: [PATCH 2/8] adam support sparse --- paddle/operators/adam_op.h | 13 ++--- python/paddle/v2/fluid/tests/test_adam_op.py | 58 +++++++++----------- 2 files changed, 32 insertions(+), 39 deletions(-) diff --git a/paddle/operators/adam_op.h b/paddle/operators/adam_op.h index aa58c4f9908b5..5facd0112f14b 100644 --- a/paddle/operators/adam_op.h +++ b/paddle/operators/adam_op.h @@ -98,13 +98,12 @@ struct SparseAdamFunctor { const int64_t* rows_; int64_t row_numel_; - int64_t height_; SparseAdamFunctor(T beta1, T beta2, T epsilon, const T* beta1_pow, const T* beta2_pow, const T* mom1, T* mom1_out, const T* mom2, T* mom2_out, const T* lr, const T* grad, const T* param, T* param_out, const int64_t* rows, - int64_t row_numel, int64_t height) + int64_t row_numel) : beta1_(beta1), beta2_(beta2), epsilon_(epsilon), @@ -119,8 +118,7 @@ struct SparseAdamFunctor { param_(param), param_out_(param_out), rows_(rows), - row_numel_(row_numel), - height_(height) {} + row_numel_(row_numel) {} inline HOSTDEVICE void operator()(size_t i) const { for (int64_t j = 0; j < row_numel_; ++j) { @@ -136,6 +134,7 @@ struct SparseAdamFunctor { mom1 = beta1_ * mom1 + (1 - beta1_) * g; mom2 = beta2_ * mom2 + (1 - beta2_) * g * g; p -= lr * (mom1 / (sqrt(mom2) + epsilon_)); + // IMPORTANT: // FIXME(typhoonzero): row id may be duplicate moment1_out_[rows_[i] 
* row_numel_ + j] = mom1; moment2_out_[rows_[i] * row_numel_ + j] = mom2; @@ -195,8 +194,7 @@ class AdamOpKernel : public framework::OpKernel { auto& grad_tensor = grad.value(); const T* grad_data = grad_tensor.template data(); auto* rows = grad.rows().data(); - auto height = grad.height(); - auto row_numel = grad_tensor.numel() / height; + auto row_numel = grad_tensor.numel() / grad.rows().size(); SparseAdamFunctor functor( beta1, beta2, epsilon, beta1_pow.template data(), @@ -205,8 +203,7 @@ class AdamOpKernel : public framework::OpKernel { mom2.template data(), mom2_out.template mutable_data(ctx.GetPlace()), lr.template data(), grad_data, param.template data(), - param_out.template mutable_data(ctx.GetPlace()), rows, row_numel, - height); + param_out.template mutable_data(ctx.GetPlace()), rows, row_numel); platform::ForRange for_range( static_cast(ctx.device_context()), grad.rows().size()); diff --git a/python/paddle/v2/fluid/tests/test_adam_op.py b/python/paddle/v2/fluid/tests/test_adam_op.py index a66fd33102720..996fcfe49d064 100644 --- a/python/paddle/v2/fluid/tests/test_adam_op.py +++ b/python/paddle/v2/fluid/tests/test_adam_op.py @@ -1,6 +1,8 @@ import unittest import numpy as np from op_test import OpTest +from paddle.v2.fluid import core +from paddle.v2.fluid.op import Operator class TestAdamOp1(OpTest): @@ -196,9 +198,9 @@ def adam_step_sparse(inputs, attributes, height, rows, row_numel, np_grad): beta2 = attributes['beta2'] epsilon = attributes['epsilon'] - moment1_out = np.array([height, row_numel]) - moment2_out = np.array([height, row_numel]) - param_out = np.array([height, row_numel]) + moment1_out = np.zeros(shape=[height, row_numel]) + moment2_out = np.zeros(shape=[height, row_numel]) + param_out = np.zeros(shape=[height, row_numel]) for idx, row_id in enumerate(rows): moment1_out[row_id] = beta1 * moment1[row_id] + (1 - beta1 @@ -206,8 +208,8 @@ def adam_step_sparse(inputs, attributes, height, rows, row_numel, np_grad): moment2_out[row_id] = beta2 * moment2[row_id] + ( 1 - beta2) * np.square(np_grad[idx]) lr_t = lr * np.sqrt(1 - beta2_pow) / (1 - beta1_pow) - param_out[row_id] = param[row_id] - lr_t * (moment1_out / ( - np.sqrt(moment2_out) + epsilon)) + param_out[row_id] = param[row_id] - lr_t * (moment1_out[row_id] / ( + np.sqrt(moment2_out[row_id]) + epsilon)) return param_out, moment1_out, moment2_out @@ -219,13 +221,15 @@ def setup(self, scope, place): height = 10 rows = [0, 4, 7] + self.rows = rows row_numel = 12 + self.row_numel = row_numel self.dense_inputs = { "Param": np.full((height, row_numel), 5.0).astype("float32"), "Moment1": np.full((height, row_numel), 5.0).astype("float32"), "Moment2": np.full((height, row_numel), 5.0).astype("float32"), - 'Beta1Pow': np.array([0.9**10]).astype("float32"), - 'Beta2Pow': np.array([0.999**10]).astype("float32"), + 'Beta1Pow': np.array([beta1**10]).astype("float32"), + 'Beta2Pow': np.array([beta2**10]).astype("float32"), "LearningRate": np.full((1), 2.0).astype("float32") } self.attrs = {'epsilon': epsilon, 'beta1': beta1, 'beta2': beta2} @@ -245,7 +249,7 @@ def setup(self, scope, place): param_out, mom1, mom2 = adam_step_sparse( self.dense_inputs, self.attrs, height, rows, row_numel, np_array) self.outputs = { - "Param": param_out, + "ParamOut": param_out, "Moment1Out": mom1, "Moment2Out": mom2 } @@ -261,37 +265,29 @@ def check_with_place(self, place): op_args[key] = key for s in self.sparse_inputs: op_args[s] = s + for s in self.outputs: + var = scope.var(s).get_tensor() + var.set(self.outputs[s], place) + op_args[s] = s 
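+        # The output variables are created in the scope up front so the
+        # adam op has somewhere to write ParamOut/Moment1Out/Moment2Out.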
for k in self.attrs: op_args[k] = self.attrs[k] # create and run sgd operator - sgd_op = Operator("adam", **op_args) - sgd_op.run(scope, place) + adam_op = Operator("adam", **op_args) + adam_op.run(scope, place) for key, np_array in self.outputs.iteritems(): out_var = scope.var(key).get_tensor() actual = np.array(out_var) - actual.reshape([actual.size()]) - np_array.reshape([np_array.size()]) - i = 0 - while i < actual.size(): - self.assertAlmostEqual(actual[i], np_array[i]) - i += 1 - - # # rows[0] = 0, 5.0 - 2.0 * 2.0 - # self.assertAlmostEqual(1.0, result_array[rows[0], 0]) - # # rows[0] = 0, 5.0 - 2.0 * 1.0 - # self.assertAlmostEqual(3.0, result_array[rows[0], 2]) - # # 5.0 - 2.0 * 0.0 - # self.assertAlmostEqual(5.0, result_array[1, 0]) - # # rows[1] = 4, 5.0 - 2.0 * 1.0 - # self.assertAlmostEqual(3.0, result_array[rows[1], 10]) - # # 5.0 - 2.0 * 0.0 - # self.assertAlmostEqual(5.0, result_array[5, 8]) - # # rows[2] = 7, 5.0 - 2.0 * 1.0 - # self.assertAlmostEqual(3.0, result_array[rows[2], 1]) - # # rows[2] = 7, 5.0 - 2.0 * 4.0 - # self.assertAlmostEqual(-3.0, result_array[rows[2], 8]) + actual = actual.reshape([actual.size]) + np_array = np_array.reshape([np_array.size]) + for idx, row_id in enumerate(self.rows): + j = 0 + while j < self.row_numel: + pos = row_id * self.row_numel + j + print (actual[pos] - np_array[pos]) / actual[pos] + self.assertLess((actual[pos] - np_array[pos]) / actual[pos], 0.00001) + j += 1 def test_sparse_sgd(self): places = [core.CPUPlace()] From dd21ae6c1ee3b681bfd069760448fead207964ee Mon Sep 17 00:00:00 2001 From: typhoonzero Date: Wed, 27 Dec 2017 09:58:51 +0800 Subject: [PATCH 3/8] update --- python/paddle/v2/fluid/tests/test_adam_op.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/paddle/v2/fluid/tests/test_adam_op.py b/python/paddle/v2/fluid/tests/test_adam_op.py index 996fcfe49d064..3758ca457e5db 100644 --- a/python/paddle/v2/fluid/tests/test_adam_op.py +++ b/python/paddle/v2/fluid/tests/test_adam_op.py @@ -285,8 +285,9 @@ def check_with_place(self, place): j = 0 while j < self.row_numel: pos = row_id * self.row_numel + j - print (actual[pos] - np_array[pos]) / actual[pos] - self.assertLess((actual[pos] - np_array[pos]) / actual[pos], 0.00001) + print(actual[pos] - np_array[pos]) / actual[pos] + self.assertLess((actual[pos] - np_array[pos]) / actual[pos], + 0.00001) j += 1 def test_sparse_sgd(self): From d48a0e4eae939f3615fabc9f86f11670fcfad6e3 Mon Sep 17 00:00:00 2001 From: typhoonzero Date: Wed, 27 Dec 2017 21:04:51 +0800 Subject: [PATCH 4/8] WIP: adding generic scattor functors --- .../operators/math/selected_rows_functor.cc | 47 +++++++++++++ .../operators/math/selected_rows_functor.cu | 67 +++++++++++++++++++ paddle/operators/math/selected_rows_functor.h | 47 +++++++++++++ 3 files changed, 161 insertions(+) diff --git a/paddle/operators/math/selected_rows_functor.cc b/paddle/operators/math/selected_rows_functor.cc index ab758d1e7fd8a..21418ba4b0201 100644 --- a/paddle/operators/math/selected_rows_functor.cc +++ b/paddle/operators/math/selected_rows_functor.cc @@ -179,6 +179,53 @@ template struct SelectedRowsAddToTensor; template struct SelectedRowsAddToTensor; template struct SelectedRowsAddToTensor; +// This is a separated namespace for manipulate SelectedRows typed +// data. Like merge duplicated rows, adding two SelectedRows etc. +// +// Another group of functors is called "scatter updates", which means +// use SelectedRows to update a dense tensor with different Ops, like +// add or mul. 
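+//
+// For example, MergeAdd folds duplicated rows by summing them: a
+// SelectedRows with rows {0, 4, 4, 7} and one-column values {1, 2, 3, 4}
+// merges into rows {0, 4, 7} with values {1, 5, 4}.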
+namespace scatter { + +size_t FindPos(const std::vector& rows, int64_t value) { + return std::find(rows.begin(), rows.end(), value) - rows.begin(); +} + +template +struct MergeAdd { + void operator()(const platform::CPUDeviceContext& context, + const framework::SelectedRows& input, + framework::SelectedRows* out) { + auto input_rows = input.rows(); + std::set row_set(input_rows.begin(), input_rows.end()); + std::vector merge_rows(row_set.begin(), row_set.end()); + + auto input_width = input.value().dims()[1]; + // std::unique_ptr out{ + // new framework::SelectedRows()}; + out->set_rows(merge_rows); + out->set_height(input.height()); + out->mutable_value()->mutable_data( + framework::make_ddim( + {static_cast(merge_rows.size()), input_width}), + context.GetPlace()); + + math::SetConstant constant_functor; + constant_functor(context, out->mutable_value(), 0.0); + + auto* out_data = out->mutable_value()->data(); + auto* input_data = input.value().data(); + + for (size_t i = 0; i < input_rows.size(); i++) { + size_t out_i = FindPos(merge_rows, input_rows[i]); + for (int64_t j = 0; j < input_width; j++) { + out_data[out_i * input_width + j] += input_data[i * input_width + j]; + } + } + } +}; + +} // namespace scatter } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/operators/math/selected_rows_functor.cu b/paddle/operators/math/selected_rows_functor.cu index 9fddd97a36f7f..b2c0fe7bc3da7 100644 --- a/paddle/operators/math/selected_rows_functor.cu +++ b/paddle/operators/math/selected_rows_functor.cu @@ -222,6 +222,73 @@ template struct SelectedRowsAddToTensor; template struct SelectedRowsAddToTensor; template struct SelectedRowsAddToTensor; template struct SelectedRowsAddToTensor; + +namespace scatter { + +template +__global__ void MergeAddKernel(const T* input, const int64_t* input_rows, + T* out, const int64_t* out_rows, + size_t out_rows_size, int64_t row_numel) { + const int ty = blockIdx.y; + int tid = threadIdx.x; + __shared__ size_t out_idx; + + if (tid == 0) { + for (size_t i = 0; i < out_rows_size; i++) { + if (input_rows[ty] == out_rows[i]) { + out_idx = i; + } + } + } + + __syncthreads(); + + input += ty * row_numel; + out += out_idx * row_numel; + for (int index = tid; index < row_numel; index += block_size) { + paddle::platform::CudaAtomicAdd(out + index, input[index]); + } +} + +template +struct MergeAdd { + void operator()(const platform::GPUDeviceContext& context, + const framework::SelectedRows& input, + framework::SelectedRows* out) { + auto input_rows = input.rows(); + std::set row_set(input_rows.begin(), input_rows.end()); + std::vector merge_rows(row_set.begin(), row_set.end()); + + auto input_width = input.value().dims()[1]; + // std::unique_ptr out{ + // new framework::SelectedRows()}; + out->set_rows(merge_rows); + out->set_height(input.height()); + out->mutable_value()->mutable_data( + framework::make_ddim( + {static_cast(merge_rows.size()), input_width}), + context.GetPlace()); + + math::SetConstant constant_functor; + constant_functor(context, out->mutable_value(), 0.0); + + auto* out_data = out->mutable_value()->data(); + auto* input_data = input.value().data(); + + const int block_size = 256; + dim3 threads(block_size, 1); + dim3 grid1(1, input_rows.size()); + + MergeAddKernel< + T, 256><<(context) + .stream()>>>(input_data, input.rows().data(), out_data, + out->rows().data(), out->rows().size(), + input_width); + } +}; + +} // namespace scatter } // namespace math } // namespace operators } // namespace paddle diff --git 
a/paddle/operators/math/selected_rows_functor.h b/paddle/operators/math/selected_rows_functor.h index 1149075abf165..8adfca77f6930 100644 --- a/paddle/operators/math/selected_rows_functor.h +++ b/paddle/operators/math/selected_rows_functor.h @@ -52,6 +52,53 @@ struct SelectedRowsAddToTensor { framework::Tensor* input2); }; +namespace scatter { +// functors for manuplating SelectedRows data + +template +struct MergeAdd { + // unary functor, merge by adding duplicated rows in + // the input SelectedRows object. + void operator()(const DeviceContext& context, + const framework::SelectedRows& input, + framework::SelectedRows* out); +}; + +template +struct Add { + void operator()(const DeviceContext& context, + const framework::SelectedRows& input1, + const framework::SelectedRows& input2, + framework::SelectedRows* out) { + out->set_rows(input1->rows()); + out->set_height(input1->height()); + out->mutable_value()->mutable_data(input1->value().dims(), + context.GetPlace()); + auto e_out = framework::EigenVector::Flatten(*(out->mutable_value())); + auto e_in1 = framework::EigenVector::Flatten(input1->value()); + auto e_in2 = framework::EigenVector::Flatten(input2->value()); + e_out.device(*context.eigen_device()) = e_in1 + e_in2; + } +}; + +template +struct Mul { + void operator()(const DeviceContext& context, + const framework::SelectedRows& input1, + const framework::SelectedRows& input2, + framework::SelectedRows* out) { + out->set_rows(input1->rows()); + out->set_height(input1->height()); + out->mutable_value()->mutable_data(input1->value().dims(), + context.GetPlace()); + auto e_out = framework::EigenVector::Flatten(*(out->mutable_value())); + auto e_in1 = framework::EigenVector::Flatten(input1->value()); + auto e_in2 = framework::EigenVector::Flatten(input2->value()); + e_out.device(*context.eigen_device()) = e_in1 * e_in2; + } +}; + +} // namespace scatter } // namespace math } // namespace operators } // namespace paddle From 74b122889cbce2aa3add92784d0b4a621abfdf45 Mon Sep 17 00:00:00 2001 From: typhoonzero Date: Wed, 27 Dec 2017 21:08:40 +0800 Subject: [PATCH 5/8] wip --- paddle/operators/math/selected_rows_functor.h | 21 ++++++++++--------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/paddle/operators/math/selected_rows_functor.h b/paddle/operators/math/selected_rows_functor.h index 8adfca77f6930..eecd5e5362bbb 100644 --- a/paddle/operators/math/selected_rows_functor.h +++ b/paddle/operators/math/selected_rows_functor.h @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once +#include "paddle/framework/eigen.h" #include "paddle/framework/selected_rows.h" #include "paddle/platform/device_context.h" @@ -70,13 +71,13 @@ struct Add { const framework::SelectedRows& input1, const framework::SelectedRows& input2, framework::SelectedRows* out) { - out->set_rows(input1->rows()); - out->set_height(input1->height()); - out->mutable_value()->mutable_data(input1->value().dims(), + out->set_rows(input1.rows()); + out->set_height(input1.height()); + out->mutable_value()->mutable_data(input1.value().dims(), context.GetPlace()); auto e_out = framework::EigenVector::Flatten(*(out->mutable_value())); - auto e_in1 = framework::EigenVector::Flatten(input1->value()); - auto e_in2 = framework::EigenVector::Flatten(input2->value()); + auto e_in1 = framework::EigenVector::Flatten(input1.value()); + auto e_in2 = framework::EigenVector::Flatten(input2.value()); e_out.device(*context.eigen_device()) = e_in1 + e_in2; } }; @@ -87,13 +88,13 @@ struct Mul { const framework::SelectedRows& input1, const framework::SelectedRows& input2, framework::SelectedRows* out) { - out->set_rows(input1->rows()); - out->set_height(input1->height()); - out->mutable_value()->mutable_data(input1->value().dims(), + out->set_rows(input1.rows()); + out->set_height(input1.height()); + out->mutable_value()->mutable_data(input1.value().dims(), context.GetPlace()); auto e_out = framework::EigenVector::Flatten(*(out->mutable_value())); - auto e_in1 = framework::EigenVector::Flatten(input1->value()); - auto e_in2 = framework::EigenVector::Flatten(input2->value()); + auto e_in1 = framework::EigenVector::Flatten(input1.value()); + auto e_in2 = framework::EigenVector::Flatten(input2.value()); e_out.device(*context.eigen_device()) = e_in1 * e_in2; } }; From 641b4c0fe6db944ffe47a3dbd8a88c7a966c41f1 Mon Sep 17 00:00:00 2001 From: typhoonzero Date: Fri, 29 Dec 2017 10:49:28 +0800 Subject: [PATCH 6/8] wip --- paddle/operators/adagrad_op.cc | 44 ++------ paddle/operators/adagrad_op.cu | 48 ++------ paddle/operators/adam_op.h | 17 ++- .../operators/math/selected_rows_functor.cc | 90 +++++++++++++-- .../operators/math/selected_rows_functor.cu | 106 ++++++++++++++++-- paddle/operators/math/selected_rows_functor.h | 74 ++++++++---- python/paddle/v2/fluid/tests/test_adam_op.py | 1 - 7 files changed, 251 insertions(+), 129 deletions(-) diff --git a/paddle/operators/adagrad_op.cc b/paddle/operators/adagrad_op.cc index 052c793a01907..c83318a272302 100644 --- a/paddle/operators/adagrad_op.cc +++ b/paddle/operators/adagrad_op.cc @@ -105,48 +105,18 @@ struct SparseAdagradFunctor { const framework::Tensor& learning_rate, T epsilon, framework::Tensor* moment, framework::Tensor* param) { // 1. 
g_m.rows = set(g.rows) - auto grad_rows = grad.rows(); - std::set row_set(grad_rows.begin(), grad_rows.end()); - std::vector merge_rows(row_set.begin(), row_set.end()); - auto grad_width = grad.value().dims()[1]; - std::unique_ptr grad_merge{ - new framework::SelectedRows()}; - grad_merge->set_rows(merge_rows); - grad_merge->set_height(grad.height()); - grad_merge->mutable_value()->mutable_data( - framework::make_ddim( - {static_cast(merge_rows.size()), grad_width}), - context.GetPlace()); - - math::SetConstant constant_functor; - constant_functor(context, grad_merge->mutable_value(), 0.0); - - auto* grad_merge_data = grad_merge->mutable_value()->data(); - auto* grad_data = grad.value().data(); - - for (size_t i = 0; i < grad_rows.size(); i++) { - size_t grad_merge_i = FindPos(merge_rows, grad_rows[i]); - for (int64_t j = 0; j < grad_width; j++) { - grad_merge_data[grad_merge_i * grad_width + j] += - grad_data[i * grad_width + j]; - } - } + math::scatter::MergeAdd merge_func; + auto grad_merge = merge_func(context, grad); + auto& merge_rows = grad_merge.rows(); + auto* grad_merge_data = grad_merge.mutable_value()->template data(); // 2. m += g_m * g_m - std::unique_ptr grad_square{ - new framework::SelectedRows()}; - grad_square->set_rows(grad_merge->rows()); - grad_square->set_height(grad_merge->height()); - grad_square->mutable_value()->mutable_data(grad_merge->value().dims(), - context.GetPlace()); - auto gs = - framework::EigenVector::Flatten(*(grad_square->mutable_value())); - auto gm = framework::EigenVector::Flatten(grad_merge->value()); - gs.device(*context.eigen_device()) = gm * gm; + math::scatter::Mul sqare_func; + auto grad_square = sqare_func(context, grad_merge, grad_merge); math::SelectedRowsAddToTensor functor; - functor(context, *grad_square, moment); + functor(context, grad_square, moment); // 3. update parameter auto* lr = learning_rate.data(); diff --git a/paddle/operators/adagrad_op.cu b/paddle/operators/adagrad_op.cu index 585b2d92894af..86b3dd860d9a6 100644 --- a/paddle/operators/adagrad_op.cu +++ b/paddle/operators/adagrad_op.cu @@ -78,51 +78,17 @@ struct SparseAdagradFunctor { const framework::Tensor& learning_rate, T epsilon, framework::Tensor* moment, framework::Tensor* param) { // 1. g_m.rows = set(g.rows) - auto grad_rows = grad.rows(); - std::set row_set(grad_rows.begin(), grad_rows.end()); - std::vector merge_rows(row_set.begin(), row_set.end()); - auto grad_width = grad.value().dims()[1]; - std::unique_ptr grad_merge{ - new framework::SelectedRows()}; - grad_merge->set_rows(merge_rows); - grad_merge->set_height(grad.height()); - grad_merge->mutable_value()->mutable_data( - framework::make_ddim( - {static_cast(merge_rows.size()), grad_width}), - context.GetPlace()); - - math::SetConstant constant_functor; - constant_functor(context, grad_merge->mutable_value(), 0.0); - - auto* grad_merge_data = grad_merge->mutable_value()->data(); - auto* grad_data = grad.value().data(); - - const int block_size = 256; - dim3 threads(block_size, 1); - dim3 grid1(1, grad_rows.size()); - - MergeGradKernel< - T, 256><<(context) - .stream()>>>(grad_data, grad.rows().data(), - grad_merge_data, grad_merge->rows().data(), - grad_merge->rows().size(), grad_width); - + math::scatter::MergeAdd merge_func; + auto grad_merge = merge_func(context, grad); + auto* grad_merge_data = grad_merge.mutable_value()->template data(); + auto& merge_rows = grad_merge.rows; // 2. 
m += g_m * g_m - std::unique_ptr grad_square{ - new framework::SelectedRows()}; - grad_square->set_rows(grad_merge->rows()); - grad_square->set_height(grad_merge->height()); - grad_square->mutable_value()->mutable_data(grad_merge->value().dims(), - context.GetPlace()); - auto gs = - framework::EigenVector::Flatten(*(grad_square->mutable_value())); - auto gm = framework::EigenVector::Flatten(grad_merge->value()); - gs.device(*context.eigen_device()) = gm * gm; + math::scatter::Mul sqare_func; + auto grad_square = sqare_func(context, grad_merge, grad_merge); math::SelectedRowsAddToTensor functor; - functor(context, *grad_square, moment); + functor(context, grad_square, moment); // 3. update parameter auto* lr = learning_rate.data(); diff --git a/paddle/operators/adam_op.h b/paddle/operators/adam_op.h index 5facd0112f14b..3c4148ccc0a7d 100644 --- a/paddle/operators/adam_op.h +++ b/paddle/operators/adam_op.h @@ -16,11 +16,14 @@ limitations under the License. */ #include // for sqrt in CPU and CUDA #include "paddle/framework/op_registry.h" #include "paddle/operators/detail/safe_ref.h" +#include "paddle/operators/math/selected_rows_functor.h" #include "paddle/platform/for_range.h" namespace paddle { namespace operators { +namespace scatter = paddle::operators::math::scatter; + template struct AdamFunctor { T beta1_; @@ -134,8 +137,6 @@ struct SparseAdamFunctor { mom1 = beta1_ * mom1 + (1 - beta1_) * g; mom2 = beta2_ * mom2 + (1 - beta2_) * g * g; p -= lr * (mom1 / (sqrt(mom2) + epsilon_)); - // IMPORTANT: - // FIXME(typhoonzero): row id may be duplicate moment1_out_[rows_[i] * row_numel_ + j] = mom1; moment2_out_[rows_[i] * row_numel_ + j] = mom2; param_out_[rows_[i] * row_numel_ + j] = p; @@ -191,10 +192,14 @@ class AdamOpKernel : public framework::OpKernel { } else if (grad_var->IsType()) { auto& grad = Ref(ctx.Input("Grad"), "Must set Grad"); - auto& grad_tensor = grad.value(); + // merge duplicated rows if any. + scatter::MergeAdd merge_func; + auto grad_merge = + merge_func(ctx.template device_context(), grad); + auto& grad_tensor = grad_merge.value(); const T* grad_data = grad_tensor.template data(); - auto* rows = grad.rows().data(); - auto row_numel = grad_tensor.numel() / grad.rows().size(); + auto* rows = grad_merge.rows().data(); + auto row_numel = grad_tensor.numel() / grad_merge.rows().size(); SparseAdamFunctor functor( beta1, beta2, epsilon, beta1_pow.template data(), @@ -206,7 +211,7 @@ class AdamOpKernel : public framework::OpKernel { param_out.template mutable_data(ctx.GetPlace()), rows, row_numel); platform::ForRange for_range( static_cast(ctx.device_context()), - grad.rows().size()); + grad_merge.rows().size()); for_range(functor); } else { PADDLE_THROW("Variable type not supported by adam_op"); diff --git a/paddle/operators/math/selected_rows_functor.cc b/paddle/operators/math/selected_rows_functor.cc index 21418ba4b0201..c9f3c10c61700 100644 --- a/paddle/operators/math/selected_rows_functor.cc +++ b/paddle/operators/math/selected_rows_functor.cc @@ -12,8 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/operators/math/selected_rows_functor.h" +#include + #include "paddle/operators/math/math_function.h" +#include "paddle/operators/math/selected_rows_functor.h" namespace paddle { namespace operators { @@ -193,27 +195,25 @@ size_t FindPos(const std::vector& rows, int64_t value) { template struct MergeAdd { - void operator()(const platform::CPUDeviceContext& context, - const framework::SelectedRows& input, - framework::SelectedRows* out) { + framework::SelectedRows operator()(const platform::CPUDeviceContext& context, + const framework::SelectedRows& input) { + framework::SelectedRows out; auto input_rows = input.rows(); std::set row_set(input_rows.begin(), input_rows.end()); std::vector merge_rows(row_set.begin(), row_set.end()); auto input_width = input.value().dims()[1]; - // std::unique_ptr out{ - // new framework::SelectedRows()}; - out->set_rows(merge_rows); - out->set_height(input.height()); - out->mutable_value()->mutable_data( + out.set_rows(merge_rows); + out.set_height(input.height()); + out.mutable_value()->mutable_data( framework::make_ddim( {static_cast(merge_rows.size()), input_width}), context.GetPlace()); math::SetConstant constant_functor; - constant_functor(context, out->mutable_value(), 0.0); + constant_functor(context, out.mutable_value(), 0.0); - auto* out_data = out->mutable_value()->data(); + auto* out_data = out.mutable_value()->data(); auto* input_data = input.value().data(); for (size_t i = 0; i < input_rows.size(); i++) { @@ -222,6 +222,74 @@ struct MergeAdd { out_data[out_i * input_width + j] += input_data[i * input_width + j]; } } + return out; + } +}; + +template struct MergeAdd; +template struct MergeAdd; +template struct MergeAdd; +template struct MergeAdd; + +template +struct UpdateToTensor { + framework::Tensor operator()(const platform::CPUDeviceContext& context, + const ScatterOps& op, + const framework::SelectedRows& input1, + framework::Tensor* input2) { + auto in1_height = input1.height(); + auto in2_dims = input2->dims(); + PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]); + + auto& in1_value = input1.value(); + auto& in1_rows = input1.rows(); + + int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); + PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height); + + auto* in1_data = in1_value.data(); + auto* input2_data = input2->data(); + + // FIXME(typhoonzero): use macro fix the below messy code. 
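+    // ScatterOps semantics, where "in" is the sparse input row and "out"
+    // is the dense tensor row it maps to: ASSIGN out = in, ADD out += in,
+    // SUB out -= in, SUBBY out = in - out, MUL out *= in, DIV out /= in,
+    // DIVBY out = in / out.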
+ switch (op) { + case ScatterOps::ASSIGN: + INLINE_FOR2(in1_rows.size(), in1_row_numel) + input2_data[in1_rows[i] * in1_row_numel + j] = + in1_data[i * in1_row_numel + j]; + break; + case ScatterOps::ADD: + INLINE_FOR2(in1_rows.size(), in1_row_numel) + input2_data[in1_rows[i] * in1_row_numel + j] += + in1_data[i * in1_row_numel + j]; + break; + case ScatterOps::SUB: + INLINE_FOR2(in1_rows.size(), in1_row_numel) + input2_data[in1_rows[i] * in1_row_numel + j] -= + in1_data[i * in1_row_numel + j]; + break; + case ScatterOps::SUBBY: + INLINE_FOR2(in1_rows.size(), in1_row_numel) + input2_data[in1_rows[i] * in1_row_numel + j] = + in1_data[i * in1_row_numel + j] - + input2_data[in1_rows[i] * in1_row_numel + j]; + break; + case ScatterOps::MUL: + INLINE_FOR2(in1_rows.size(), in1_row_numel) + input2_data[in1_rows[i] * in1_row_numel + j] *= + in1_data[i * in1_row_numel + j]; + break; + case ScatterOps::DIV: + INLINE_FOR2(in1_rows.size(), in1_row_numel) + input2_data[in1_rows[i] * in1_row_numel + j] /= + in1_data[i * in1_row_numel + j]; + break; + case ScatterOps::DIVBY: + INLINE_FOR2(in1_rows.size(), in1_row_numel) + input2_data[in1_rows[i] * in1_row_numel + j] = + in1_data[i * in1_row_numel + j] / + input2_data[in1_rows[i] * in1_row_numel + j]; + break; + } } }; diff --git a/paddle/operators/math/selected_rows_functor.cu b/paddle/operators/math/selected_rows_functor.cu index b2c0fe7bc3da7..48413403db513 100644 --- a/paddle/operators/math/selected_rows_functor.cu +++ b/paddle/operators/math/selected_rows_functor.cu @@ -252,27 +252,26 @@ __global__ void MergeAddKernel(const T* input, const int64_t* input_rows, template struct MergeAdd { - void operator()(const platform::GPUDeviceContext& context, - const framework::SelectedRows& input, - framework::SelectedRows* out) { + framework::SelectedRows operator()(const platform::GPUDeviceContext& context, + const framework::SelectedRows& input) { + framework::SelectedRows out; auto input_rows = input.rows(); std::set row_set(input_rows.begin(), input_rows.end()); std::vector merge_rows(row_set.begin(), row_set.end()); auto input_width = input.value().dims()[1]; - // std::unique_ptr out{ - // new framework::SelectedRows()}; - out->set_rows(merge_rows); - out->set_height(input.height()); - out->mutable_value()->mutable_data( + + out.set_rows(merge_rows); + out.set_height(input.height()); + out.mutable_value()->mutable_data( framework::make_ddim( {static_cast(merge_rows.size()), input_width}), context.GetPlace()); math::SetConstant constant_functor; - constant_functor(context, out->mutable_value(), 0.0); + constant_functor(context, out.mutable_value(), 0.0); - auto* out_data = out->mutable_value()->data(); + auto* out_data = out.mutable_value()->data(); auto* input_data = input.value().data(); const int block_size = 256; @@ -283,11 +282,96 @@ struct MergeAdd { T, 256><<(context) .stream()>>>(input_data, input.rows().data(), out_data, - out->rows().data(), out->rows().size(), + out.rows().data(), out.rows().size(), input_width); + return out; } }; +template struct MergeAdd; +template struct MergeAdd; +template struct MergeAdd; +template struct MergeAdd; + +template +__global__ void UpdateToTensorKernel(const T* selected_rows, + const int64_t* rows, const ScatterOps& op, + T* tensor_out, int64_t row_numel) { + const int ty = blockIdx.y; + int tid = threadIdx.x; + + selected_rows += ty * row_numel; + tensor_out += rows[ty] * row_numel; + // FIXME(typhoonzero): use macro fix the below messy code. 
+ switch (op) { + case ScatterOps::ASSIGN: + for (int index = tid; index < row_numel; index += block_size) { + tensor_out[index] = selected_rows[index]; + } + break; + case ScatterOps::ADD: + for (int index = tid; index < row_numel; index += block_size) { + tensor_out[index] += selected_rows[index]; + } + break; + case ScatterOps::SUB: + for (int index = tid; index < row_numel; index += block_size) { + tensor_out[index] -= selected_rows[index]; + } + break; + case ScatterOps::SUBBY: + for (int index = tid; index < row_numel; index += block_size) { + tensor_out[index] = selected_rows[index] - tensor_out[index]; + } + break; + case ScatterOps::MUL: + for (int index = tid; index < row_numel; index += block_size) { + tensor_out[index] *= selected_rows[index]; + } + break; + case ScatterOps::DIV: + for (int index = tid; index < row_numel; index += block_size) { + tensor_out[index] /= selected_rows[index]; + } + break; + case ScatterOps::DIVBY: + for (int index = tid; index < row_numel; index += block_size) { + tensor_out[index] = selected_rows[index] / tensor_out[index]; + } + break; + } +} + +template +struct UpdateToTensor { + framework::Tensor operator()(const platform::GPUDeviceContext& context, + const ScatterOps& op, + const framework::SelectedRows& input1, + framework::Tensor* input2) { + // NOTE: Use SelectedRowsAddToTensor for better performance + // no additional MergeAdd called. + auto merged_in1 = MergeAdd()(context, input1); + + auto in1_height = merged_in1.height(); + auto in2_dims = input2->dims(); + PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]); + + auto& in1_value = merged_in1.value(); + auto& in1_rows = merged_in1.rows(); + + int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); + PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height); + + auto* in1_data = in1_value.data(); + auto* input2_data = input2->data(); + + dim3 threads(PADDLE_CUDA_NUM_THREADS, 1); + dim3 grid(1, in1_rows.size()); + UpdateToTensorKernel< + T, PADDLE_CUDA_NUM_THREADS><<>>( + in1_data, in1_rows.data(), op, in2_data, in1_row_numel); + } +}; } // namespace scatter } // namespace math } // namespace operators diff --git a/paddle/operators/math/selected_rows_functor.h b/paddle/operators/math/selected_rows_functor.h index eecd5e5362bbb..d4bef72980f83 100644 --- a/paddle/operators/math/selected_rows_functor.h +++ b/paddle/operators/math/selected_rows_functor.h @@ -16,6 +16,10 @@ limitations under the License. */ #include "paddle/framework/selected_rows.h" #include "paddle/platform/device_context.h" +#define INLINE_FOR2(sizei, sizej) \ + for (int64_t i = 0; i < sizei; i++) \ + for (int64_t j = 0; j < sizej; j++) + namespace paddle { namespace operators { namespace math { @@ -55,50 +59,76 @@ struct SelectedRowsAddToTensor { namespace scatter { // functors for manuplating SelectedRows data - template struct MergeAdd { // unary functor, merge by adding duplicated rows in // the input SelectedRows object. 
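+  // A typical call site looks roughly like the following (a sketch only;
+  // `dev_ctx` and `sparse_grad` are placeholder names):
+  //   scatter::MergeAdd<platform::CPUDeviceContext, float> merge_func;
+  //   framework::SelectedRows merged = merge_func(dev_ctx, sparse_grad);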
- void operator()(const DeviceContext& context, - const framework::SelectedRows& input, - framework::SelectedRows* out); + framework::SelectedRows operator()(const DeviceContext& context, + const framework::SelectedRows& input); }; template struct Add { - void operator()(const DeviceContext& context, - const framework::SelectedRows& input1, - const framework::SelectedRows& input2, - framework::SelectedRows* out) { - out->set_rows(input1.rows()); - out->set_height(input1.height()); - out->mutable_value()->mutable_data(input1.value().dims(), - context.GetPlace()); - auto e_out = framework::EigenVector::Flatten(*(out->mutable_value())); + framework::SelectedRows operator()(const DeviceContext& context, + const framework::SelectedRows& input1, + const framework::SelectedRows& input2) { + framework::SelectedRows out; + out.set_rows(input1.rows()); + out.set_height(input1.height()); + out.mutable_value()->mutable_data(input1.value().dims(), + context.GetPlace()); + auto e_out = framework::EigenVector::Flatten(*(out.mutable_value())); auto e_in1 = framework::EigenVector::Flatten(input1.value()); auto e_in2 = framework::EigenVector::Flatten(input2.value()); e_out.device(*context.eigen_device()) = e_in1 + e_in2; + return out; } }; template struct Mul { - void operator()(const DeviceContext& context, - const framework::SelectedRows& input1, - const framework::SelectedRows& input2, - framework::SelectedRows* out) { - out->set_rows(input1.rows()); - out->set_height(input1.height()); - out->mutable_value()->mutable_data(input1.value().dims(), - context.GetPlace()); - auto e_out = framework::EigenVector::Flatten(*(out->mutable_value())); + // multiply two SelectedRows + framework::SelectedRows operator()(const DeviceContext& context, + const framework::SelectedRows& input1, + const framework::SelectedRows& input2) { + framework::SelectedRows out; + out.set_rows(input1.rows()); + out.set_height(input1.height()); + out.mutable_value()->mutable_data(input1.value().dims(), + context.GetPlace()); + auto e_out = framework::EigenVector::Flatten(*(out.mutable_value())); auto e_in1 = framework::EigenVector::Flatten(input1.value()); auto e_in2 = framework::EigenVector::Flatten(input2.value()); e_out.device(*context.eigen_device()) = e_in1 * e_in2; + return out; + } + // multiply scalar to SelectedRows + framework::SelectedRows operator()(const DeviceContext& context, + const framework::SelectedRows& input1, + const T input2) { + framework::SelectedRows out; + out.set_rows(input1.rows()); + out.set_height(input1.height()); + out.mutable_value()->mutable_data(input1.value().dims(), + context.GetPlace()); + auto e_out = framework::EigenVector::Flatten(*(out.mutable_value())); + auto e_in1 = framework::EigenVector::Flatten(input1.value()); + e_out.device(*context.eigen_device()) = input2 * e_in1; + return out; } }; +enum class ScatterOps { ASSIGN, ADD, SUB, SUBBY, MUL, DIV, DIVBY }; + +// out = seleted_rows_in / tensor +template +struct UpdateToTensor { + framework::Tensor operator()(const DeviceContext& context, + const ScatterOps& op, + const framework::SelectedRows& input1, + framework::Tensor* input2); +}; + } // namespace scatter } // namespace math } // namespace operators diff --git a/python/paddle/v2/fluid/tests/test_adam_op.py b/python/paddle/v2/fluid/tests/test_adam_op.py index 3758ca457e5db..7dbc2fa0858a6 100644 --- a/python/paddle/v2/fluid/tests/test_adam_op.py +++ b/python/paddle/v2/fluid/tests/test_adam_op.py @@ -285,7 +285,6 @@ def check_with_place(self, place): j = 0 while j < self.row_numel: pos = 
row_id * self.row_numel + j - print(actual[pos] - np_array[pos]) / actual[pos] self.assertLess((actual[pos] - np_array[pos]) / actual[pos], 0.00001) j += 1 From 1039c1e3b7b391963fe2e4f1dba22d3358104a98 Mon Sep 17 00:00:00 2001 From: typhoonzero Date: Fri, 29 Dec 2017 13:51:41 +0800 Subject: [PATCH 7/8] scatter optimizers --- paddle/operators/adagrad_op.cu | 10 +++-- .../operators/math/selected_rows_functor.cc | 7 ++-- .../operators/math/selected_rows_functor.cu | 38 ++++++++++--------- paddle/operators/math/selected_rows_functor.h | 7 ++-- 4 files changed, 32 insertions(+), 30 deletions(-) diff --git a/paddle/operators/adagrad_op.cu b/paddle/operators/adagrad_op.cu index 86b3dd860d9a6..fed2e29367de5 100644 --- a/paddle/operators/adagrad_op.cu +++ b/paddle/operators/adagrad_op.cu @@ -79,12 +79,12 @@ struct SparseAdagradFunctor { framework::Tensor* moment, framework::Tensor* param) { // 1. g_m.rows = set(g.rows) auto grad_width = grad.value().dims()[1]; - math::scatter::MergeAdd merge_func; + math::scatter::MergeAdd merge_func; auto grad_merge = merge_func(context, grad); auto* grad_merge_data = grad_merge.mutable_value()->template data(); - auto& merge_rows = grad_merge.rows; + auto& merge_rows = grad_merge.rows(); // 2. m += g_m * g_m - math::scatter::Mul sqare_func; + math::scatter::Mul sqare_func; auto grad_square = sqare_func(context, grad_merge, grad_merge); math::SelectedRowsAddToTensor functor; @@ -95,11 +95,13 @@ struct SparseAdagradFunctor { auto* param_data = param->data(); auto* moment_data = moment->data(); + const int block_size = 256; + dim3 threads(block_size, 1); dim3 grid2(1, merge_rows.size()); SparseAdagradFunctorKernel< T, 256><<(context) - .stream()>>>(grad_merge_data, grad_merge->rows().data(), + .stream()>>>(grad_merge_data, grad_merge.rows().data(), lr, param_data, moment_data, grad_width, epsilon); } diff --git a/paddle/operators/math/selected_rows_functor.cc b/paddle/operators/math/selected_rows_functor.cc index c9f3c10c61700..8a1ebb58c2657 100644 --- a/paddle/operators/math/selected_rows_functor.cc +++ b/paddle/operators/math/selected_rows_functor.cc @@ -233,10 +233,9 @@ template struct MergeAdd; template struct UpdateToTensor { - framework::Tensor operator()(const platform::CPUDeviceContext& context, - const ScatterOps& op, - const framework::SelectedRows& input1, - framework::Tensor* input2) { + void operator()(const platform::CPUDeviceContext& context, + const ScatterOps& op, const framework::SelectedRows& input1, + framework::Tensor* input2) { auto in1_height = input1.height(); auto in2_dims = input2->dims(); PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]); diff --git a/paddle/operators/math/selected_rows_functor.cu b/paddle/operators/math/selected_rows_functor.cu index 48413403db513..0ee456f9bc614 100644 --- a/paddle/operators/math/selected_rows_functor.cu +++ b/paddle/operators/math/selected_rows_functor.cu @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include + #include "paddle/operators/math/math_function.h" #include "paddle/operators/math/selected_rows_functor.h" #include "paddle/platform/cuda_helper.h" @@ -251,8 +253,8 @@ __global__ void MergeAddKernel(const T* input, const int64_t* input_rows, } template -struct MergeAdd { - framework::SelectedRows operator()(const platform::GPUDeviceContext& context, +struct MergeAdd { + framework::SelectedRows operator()(const platform::CUDADeviceContext& context, const framework::SelectedRows& input) { framework::SelectedRows out; auto input_rows = input.rows(); @@ -288,10 +290,10 @@ struct MergeAdd { } }; -template struct MergeAdd; -template struct MergeAdd; -template struct MergeAdd; -template struct MergeAdd; +template struct MergeAdd; +template struct MergeAdd; +template struct MergeAdd; +template struct MergeAdd; template __global__ void UpdateToTensorKernel(const T* selected_rows, @@ -343,14 +345,14 @@ __global__ void UpdateToTensorKernel(const T* selected_rows, } template -struct UpdateToTensor { - framework::Tensor operator()(const platform::GPUDeviceContext& context, - const ScatterOps& op, - const framework::SelectedRows& input1, - framework::Tensor* input2) { +struct UpdateToTensor { + void operator()(const platform::CUDADeviceContext& context, + const ScatterOps& op, const framework::SelectedRows& input1, + framework::Tensor* input2) { // NOTE: Use SelectedRowsAddToTensor for better performance // no additional MergeAdd called. - auto merged_in1 = MergeAdd()(context, input1); + MergeAdd merge_func; + auto merged_in1 = merge_func(context, input1); auto in1_height = merged_in1.height(); auto in2_dims = input2->dims(); @@ -362,14 +364,14 @@ struct UpdateToTensor { int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height); - auto* in1_data = in1_value.data(); - auto* input2_data = input2->data(); + auto* in1_data = in1_value.template data(); + auto* in2_data = input2->data(); - dim3 threads(PADDLE_CUDA_NUM_THREADS, 1); + dim3 threads(platform::PADDLE_CUDA_NUM_THREADS, 1); dim3 grid(1, in1_rows.size()); - UpdateToTensorKernel< - T, PADDLE_CUDA_NUM_THREADS><<>>( - in1_data, in1_rows.data(), op, in2_data, in1_row_numel); + UpdateToTensorKernel<<< + grid, threads, 0, context.stream()>>>(in1_data, in1_rows.data(), op, + in2_data, in1_row_numel); } }; } // namespace scatter diff --git a/paddle/operators/math/selected_rows_functor.h b/paddle/operators/math/selected_rows_functor.h index d4bef72980f83..09d4631905f90 100644 --- a/paddle/operators/math/selected_rows_functor.h +++ b/paddle/operators/math/selected_rows_functor.h @@ -123,10 +123,9 @@ enum class ScatterOps { ASSIGN, ADD, SUB, SUBBY, MUL, DIV, DIVBY }; // out = seleted_rows_in / tensor template struct UpdateToTensor { - framework::Tensor operator()(const DeviceContext& context, - const ScatterOps& op, - const framework::SelectedRows& input1, - framework::Tensor* input2); + void operator()(const DeviceContext& context, const ScatterOps& op, + const framework::SelectedRows& input1, + framework::Tensor* input2); }; } // namespace scatter From 903d5609c61046cfa37280af5506ca21e350b852 Mon Sep 17 00:00:00 2001 From: typhoonzero Date: Fri, 29 Dec 2017 14:11:37 +0800 Subject: [PATCH 8/8] follow comment1 --- paddle/operators/adam_op.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/paddle/operators/adam_op.h b/paddle/operators/adam_op.h index 3c4148ccc0a7d..9cc34bdded780 100644 --- a/paddle/operators/adam_op.h +++ b/paddle/operators/adam_op.h @@ -124,19 
+124,20 @@ struct SparseAdamFunctor {
         row_numel_(row_numel) {}
 
   inline HOSTDEVICE void operator()(size_t i) const {
+    T beta1_pow = *beta1_pow_;
+    T beta2_pow = *beta2_pow_;
     for (int64_t j = 0; j < row_numel_; ++j) {
       T g = grad_[i * row_numel_ + j];
       T mom1 = moment1_[rows_[i] * row_numel_ + j];
       T mom2 = moment2_[rows_[i] * row_numel_ + j];
       T lr = *lr_;
-      T beta1_pow = *beta1_pow_;
-      T beta2_pow = *beta2_pow_;
       T p = param_[rows_[i] * row_numel_ + j];
 
       lr *= sqrt(1 - beta2_pow) / (1 - beta1_pow);
       mom1 = beta1_ * mom1 + (1 - beta1_) * g;
       mom2 = beta2_ * mom2 + (1 - beta2_) * g * g;
       p -= lr * (mom1 / (sqrt(mom2) + epsilon_));
+
       moment1_out_[rows_[i] * row_numel_ + j] = mom1;
       moment2_out_[rows_[i] * row_numel_ + j] = mom2;
       param_out_[rows_[i] * row_numel_ + j] = p;