
Enhance reduce op #10708

Merged: 11 commits, May 23, 2018
51 changes: 33 additions & 18 deletions paddle/fluid/operators/reduce_op.cc
@@ -14,6 +14,7 @@ limitations under the License. */

#include "paddle/fluid/operators/reduce_op.h"

#include <algorithm>
#include <string>
#include <vector>

@@ -34,11 +35,14 @@ class ReduceOp : public framework::OperatorWithKernel {
auto x_dims = ctx->GetInputDim("X");
auto x_rank = x_dims.size();
PADDLE_ENFORCE_LE(x_rank, 6, "Tensors with rank at most 6 are supported.");
int dim = ctx->Attrs().Get<int>("dim");
if (dim < 0) dim = x_rank + dim;
PADDLE_ENFORCE_LT(
dim, x_rank,
"The dim should be in the range [-rank(input), rank(input)).");
auto dims = ctx->Attrs().Get<std::vector<int>>("dim");
for (size_t i = 0; i < dims.size(); ++i) {
if (dims[i] < 0) dims[i] = x_rank + dims[i];
PADDLE_ENFORCE_LT(
dims[i], x_rank,
"The dim should be in the range [-rank(input), rank(input)).");
}
sort(dims.begin(), dims.end());
bool reduce_all = ctx->Attrs().Get<bool>("reduce_all");
bool keep_dim = ctx->Attrs().Get<bool>("keep_dim");
if (reduce_all) {
@@ -49,14 +53,22 @@ class ReduceOp : public framework::OperatorWithKernel {
ctx->SetOutputDim("Out", {1});
} else {
auto dims_vector = vectorize(x_dims);
if (keep_dim || x_rank == 1) {
dims_vector[dim] = 1;
if (keep_dim) {
for (size_t i = 0; i < dims.size(); ++i) {
dims_vector[dims[i]] = 1;
}
} else {
dims_vector.erase(dims_vector.begin() + dim);
const int DEL_FLAG = -2;
for (size_t i = 0; i < dims.size(); ++i) {
dims_vector[dims[i]] = DEL_FLAG;
}
dims_vector.erase(
remove(dims_vector.begin(), dims_vector.end(), DEL_FLAG),
dims_vector.end());
}
auto out_dims = framework::make_ddim(dims_vector);
ctx->SetOutputDim("Out", out_dims);
if (dim != 0) {
if (dims[0] != 0) {
// Only pass LoD when not reducing on the first dim.
ctx->ShareLoD("X", /*->*/ "Out");
}
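
To make the new shape inference concrete, here is a small standalone sketch (not Paddle code; a hypothetical main) of the same dims_vector bookkeeping, reducing a {2, 3, 4} shape over dims {0, 2}:

#include <algorithm>
#include <iostream>
#include <vector>

int main() {
  std::vector<int> x_dims = {2, 3, 4};
  std::vector<int> dims = {0, 2};  // already normalized and sorted
  bool keep_dim = false;
  std::vector<int> out = x_dims;
  if (keep_dim) {
    for (int d : dims) out[d] = 1;  // would yield {1, 3, 1}
  } else {
    const int DEL_FLAG = -2;        // same sentinel trick as above
    for (int d : dims) out[d] = DEL_FLAG;
    out.erase(std::remove(out.begin(), out.end(), DEL_FLAG), out.end());
  }
  for (int d : out) std::cout << d << ' ';  // prints: 3
}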
@@ -75,11 +87,14 @@ class ReduceGradOp : public framework::OperatorWithKernel {
auto x_dims = ctx->GetInputDim("X");
auto x_rank = x_dims.size();
PADDLE_ENFORCE_LE(x_rank, 6, "Tensors with rank at most 6 are supported.");
int dim = ctx->Attrs().Get<int>("dim");
if (dim < 0) dim = x_rank + dim;
PADDLE_ENFORCE_LT(
dim, x_rank,
"The dim should be in the range [-rank(input), rank(input)).");
auto dims = ctx->Attrs().Get<std::vector<int>>("dim");
for (size_t i = 0; i < dims.size(); ++i) {
if (dims[i] < 0) dims[i] = x_rank + dims[i];
PADDLE_ENFORCE_LT(
dims[i], x_rank,
"The dim should be in the range [-rank(input), rank(input)).");
}
sort(dims.begin(), dims.end());
auto x_grad_name = framework::GradVarName("X");
if (ctx->HasOutput(x_grad_name)) {
ctx->SetOutputDim(x_grad_name, x_dims);
@@ -95,13 +110,13 @@ class ReduceOpMaker : public framework::OpProtoAndCheckerMaker {
"(Tensor) The input tensor. Tensors with rank at most 6 are "
"supported.");
AddOutput("Out", "(Tensor) The result tensor.");
AddAttr<int>(
AddAttr<std::vector<int>>(
"dim",
"(int, default 0) The dimension to reduce. "
"(list<int>, default {0}) The dimensions to reduce. "
"Must be in the range [-rank(input), rank(input)). "
"If `dim < 0`, the dim to reduce is `rank + dim`. "
"If `dim[i] < 0`, the dims[i] to reduce is `rank + dims[i]`. "
"Note that reducing on the first dim will make the LoD info lost.")
.SetDefault(0);
.SetDefault({0});
AddAttr<bool>("keep_dim",
"(bool, default false) "
"If true, retain the reduced dimension with length 1.")
99 changes: 59 additions & 40 deletions paddle/fluid/operators/reduce_op.h
@@ -14,6 +14,7 @@ limitations under the License. */

#pragma once

#include <vector>
#include "glog/logging.h"
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
@@ -109,6 +110,11 @@ struct ProdGradFunctor {
}
};

#define HANDLE_DIM(NDIM, RDIM) \
if (ndim == NDIM && rdim == RDIM) { \
ReduceCompute<NDIM, RDIM>(context); \
}
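
Each HANDLE_DIM(NDIM, RDIM) invocation below expands to an independent if statement, so at most one branch runs for a given (ndim, rdim) pair; HANDLE_DIM(6, 5); for example expands to roughly:

  if (ndim == 6 && rdim == 5) {
    ReduceCompute<6, 5>(context);
  }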

template <typename DeviceContext, typename T, typename Functor>
class ReduceKernel : public framework::OpKernel<T> {
public:
@@ -127,51 +133,56 @@ class ReduceKernel : public framework::OpKernel<T> {
Functor functor;
functor(place, &x, &out, reduce_dim);
} else {
int rank = context.Input<Tensor>("X")->dims().size();
switch (rank) {
case 1:
ReduceCompute<1>(context);
break;
case 2:
ReduceCompute<2>(context);
break;
case 3:
ReduceCompute<3>(context);
break;
case 4:
ReduceCompute<4>(context);
break;
case 5:
ReduceCompute<5>(context);
break;
case 6:
ReduceCompute<6>(context);
break;
}
int ndim = context.Input<Tensor>("X")->dims().size();
int rdim = context.Attr<std::vector<int>>("dim").size();
HANDLE_DIM(6, 5);
[Review comment — Contributor] It seems we can simplify the code here by employing boost/preprocessor; please refer to https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/expand_op.h#L31. However, it would decrease the readability.
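
A rough, untested sketch of that suggestion, for the record (the BOOST_PP_* macros are real Boost.Preprocessor names; HANDLE_RDIM and HANDLE_NDIM are hypothetical helpers):

#include <boost/preprocessor/arithmetic/add.hpp>
#include <boost/preprocessor/arithmetic/inc.hpp>
#include <boost/preprocessor/repetition/repeat.hpp>

// Inner loop: for a fixed NDIM (the data argument), emit RDIM = 1..NDIM-1.
#define HANDLE_RDIM(z, r, ndim) HANDLE_DIM(ndim, BOOST_PP_INC(r));
// Outer loop: n = 0..4 maps to NDIM = 2..6, each with NDIM-1 inner steps.
#define HANDLE_NDIM(z, n, unused) \
  BOOST_PP_REPEAT(BOOST_PP_INC(n), HANDLE_RDIM, BOOST_PP_ADD(n, 2))

BOOST_PP_REPEAT(5, HANDLE_NDIM, ~)  // every (NDIM, RDIM), 1 <= RDIM < NDIM <= 6
HANDLE_DIM(1, 1);                   // rank-1 special case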

HANDLE_DIM(6, 4);
HANDLE_DIM(6, 3);
HANDLE_DIM(6, 2);
HANDLE_DIM(6, 1);
HANDLE_DIM(5, 4);
HANDLE_DIM(5, 3);
HANDLE_DIM(5, 2);
HANDLE_DIM(5, 1);
HANDLE_DIM(4, 3);
HANDLE_DIM(4, 2);
HANDLE_DIM(4, 1);
HANDLE_DIM(3, 2);
HANDLE_DIM(3, 1);
HANDLE_DIM(2, 1);
HANDLE_DIM(1, 1);
[Review comment — Contributor] reduce_op now compiles very slowly; in GPU mode it took nearly 5 minutes to compile this op. Could it be that lines 138-153 support too many dimension combinations? Can we drop ranks 5 and 6, cutting the cases from 15 down to 6?

[Reply — Contributor Author] @panyx0718 Merged a PR to reduce the supported ranks: #11113

}
}

private:
template <size_t D>
template <size_t D, size_t R_D>
void ReduceCompute(const framework::ExecutionContext& context) const {
auto* input = context.Input<Tensor>("X");
auto* output = context.Output<Tensor>("Out");
output->mutable_data<T>(context.GetPlace());

auto x = EigenTensor<T, D>::From(*input);
auto x_rank = static_cast<int>(x.dimensions().size());
int dim = static_cast<int>(context.Attr<int>("dim"));
if (dim < 0) dim = x_rank + dim;
auto reduce_dim = Eigen::array<int, 1>({{dim}});
auto dims = context.Attr<std::vector<int>>("dim");
auto reduce_dim = Eigen::array<int, R_D>();
for (size_t i = 0; i < dims.size(); ++i) {
if (dims[i] < 0) dims[i] = x_rank + dims[i];
reduce_dim[i] = dims[i];
}
// construct the squeezed output tensor
bool keep_dim = context.Attr<bool>("keep_dim");
DDim dims = output->dims();
auto dims_vector = vectorize(dims);
DDim out_dims = output->dims();
if (keep_dim && x_rank > 1) {
dims_vector.erase(dims_vector.begin() + dim);
dims = framework::make_ddim(dims_vector);
int DEL_FLAG = -2;
[Review comment — @qingqing01, May 23, 2018] (comment body not captured)

[Reply — Contributor Author] Fixed. Thanks.

auto dims_vector = vectorize(out_dims);
for (size_t i = 0; i < dims.size(); ++i) {
dims_vector[dims[i]] = DEL_FLAG;
}
dims_vector.erase(
remove(dims_vector.begin(), dims_vector.end(), DEL_FLAG),
dims_vector.end());
out_dims = framework::make_ddim(dims_vector);
}

auto& place =
*context.template device_context<DeviceContext>().eigen_device();
Functor functor;
Expand All @@ -180,7 +191,7 @@ class ReduceKernel : public framework::OpKernel<T> {
auto out = EigenScalar<T>::From(*output);
functor(place, &x, &out, reduce_dim);
} else {
auto out = EigenTensor<T, (D - 1)>::From(*output, dims);
auto out = EigenTensor<T, (D - R_D)>::From(*output, out_dims);
functor(place, &x, &out, reduce_dim);
}
}
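
For context, the Functor supplied here is a thin Eigen expression wrapper; SumFunctor, defined earlier in this file (collapsed out of the diff), has roughly this shape:

struct SumFunctor {
  // Applies an Eigen sum over the given reduction dims on the chosen device.
  template <typename DeviceContext, typename X, typename Y, typename Dim>
  void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
    y->device(place) = x->sum(dim);
  }
};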
@@ -245,21 +256,29 @@ class ReduceGradKernel : public framework::OpKernel<T> {
auto x = EigenTensor<T, D>::From(*input0);
auto x_grad = EigenTensor<T, D>::From(*output);
auto x_rank = static_cast<int>(x.dimensions().size());
int dim = static_cast<int>(context.Attr<int>("dim"));
if (dim < 0) dim = x_rank + dim;
DDim dims = input0->dims();
dims[dim] = 1;
auto x_reduce = EigenTensor<T, D>::From(*input1, dims);
auto x_reduce_grad = EigenTensor<T, D>::From(*input2, dims);

auto dims = context.Attr<std::vector<int>>("dim");
auto x_dims = input0->dims();
auto reduced_dims_v = vectorize(x_dims);
Eigen::array<int, D> broadcast_dim;
for (size_t i = 0; i < D; ++i) broadcast_dim[i] = 1;
broadcast_dim[dim] = input0->dims()[dim];

int broad_cats_times = 1;
for (size_t i = 0; i < dims.size(); ++i) {
if (dims[i] < 0) dims[i] = x_rank + dims[i];
reduced_dims_v[dims[i]] = 1;
broadcast_dim[dims[i]] = x_dims[dims[i]];
broad_cats_times *= x_dims[dims[i]];
}
auto reduced_dims = framework::make_ddim(reduced_dims_v);
auto x_reduce = EigenTensor<T, D>::From(*input1, reduced_dims);
auto x_reduce_grad = EigenTensor<T, D>::From(*input2, reduced_dims);

auto& place =
*context.template device_context<DeviceContext>().eigen_device();

Functor functor;
functor(place, &x, &x_reduce, &x_grad, &x_reduce_grad, broadcast_dim,
broadcast_dim[dim]);
broad_cats_times);
}
};
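
A concrete trace of the broadcast bookkeeping above: for a {2, 3, 4} input reduced over dims {0, 2}, reduced_dims_v becomes {1, 3, 1}, broadcast_dim becomes {2, 1, 4}, and broad_cats_times is 2 * 4 = 8. A mean-style gradient functor then broadcasts dy back to the full shape and divides by that count, roughly as follows (a hedged sketch; the real functors live earlier in this file):

struct MeanGradFunctor {
  // Spreads each output gradient over the 'size' input elements it covers.
  template <typename DeviceContext, typename X, typename Y, typename DX,
            typename DY, typename Dim>
  void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
                  const Dim& dim, int size) {
    dx->device(place) = dy->broadcast(dim) / dx->constant(size);
  }
};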
