From c0d2ca54b9bfea943c61ae09573ee188e0e1042b Mon Sep 17 00:00:00 2001 From: kexinzhao <19hskevin87@gmail.com> Date: Fri, 3 Nov 2017 19:12:32 -0700 Subject: [PATCH] polish_g_to_l (#5367) --- paddle/operators/gather_op.cc | 23 ++++++- paddle/operators/gaussian_random_op.cc | 34 ++++++++--- paddle/operators/gru_unit_op.cc | 39 ++++++------ paddle/operators/huber_loss_op.cc | 6 +- paddle/operators/increment_op.cc | 12 ++-- paddle/operators/l1_norm_op.cc | 2 +- paddle/operators/load_op.cc | 12 ++-- paddle/operators/lookup_table_op.cc | 26 +++++--- paddle/operators/lrn_op.cc | 84 +++++++++++++------------- paddle/operators/lstm_op.cc | 65 ++++++++++---------- paddle/operators/lstm_unit_op.cc | 19 +++--- 11 files changed, 187 insertions(+), 135 deletions(-) diff --git a/paddle/operators/gather_op.cc b/paddle/operators/gather_op.cc index f6c7f472da24a..aee672500ee5d 100644 --- a/paddle/operators/gather_op.cc +++ b/paddle/operators/gather_op.cc @@ -67,11 +67,28 @@ class GatherOpMaker : public framework::OpProtoAndCheckerMaker { : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "The source input of gather op"); AddInput("Index", "The index input of gather op"); - AddOutput("Out", "The output of add op"); + AddOutput("Out", "The output of gather op"); AddComment(R"DOC( -Gather Operator by selecting from the first axis, +Gather Operator. + +$Out = X[Index]$ + +Out is obtained by gathering entries of the outer-most dimension +of X indexed by Index and concatenate them together. + +Example: + +X = [[1, 2], + [3, 4], + [5, 6]] + +Index = [[1, 2]] + +Then: + +Out = [[3, 4], + [5, 6]] -Out = X[Index] )DOC"); } }; diff --git a/paddle/operators/gaussian_random_op.cc b/paddle/operators/gaussian_random_op.cc index be7f542a7a274..802c98ae764d0 100644 --- a/paddle/operators/gaussian_random_op.cc +++ b/paddle/operators/gaussian_random_op.cc @@ -68,21 +68,35 @@ class GaussianRandomOpMaker : public framework::OpProtoAndCheckerMaker { GaussianRandomOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker) : framework::OpProtoAndCheckerMaker(proto, op_checker) { - AddOutput("Out", "output matrix of random op"); - AddComment(R"DOC( -GaussianRandom operator. -Use to initialize tensor with gaussian random generator. -)DOC"); + AddOutput("Out", "Output matrix of gaussian random op"); - AddAttr>("shape", "The dimension of random tensor."); - AddAttr("mean", "mean of random tensor.").SetDefault(.0f); - AddAttr("std", "std of random tensor.").SetDefault(1.0f); + AddAttr>("shape", + "(vector) " + "The dimension of random tensor."); + AddAttr("mean", + "(float, default 0.0) " + "mean of random tensor.") + .SetDefault(.0f); + AddAttr("std", + "(float, default 1.0) " + "std of random tensor.") + .SetDefault(1.0f); AddAttr("seed", + "(int, default 0) " "Random seed of generator." - "0 means use system wide seed") + "0 means use system wide seed.") .SetDefault(0); - AddAttr("data_type", "output data type") + AddAttr("data_type", + "(int, default 5(FP32)) " + "Output data type.") .SetDefault(framework::DataType::FP32); + + AddComment(R"DOC( +GaussianRandom Operator. + +Used to initialize tensors with gaussian random generator. + +)DOC"); } }; diff --git a/paddle/operators/gru_unit_op.cc b/paddle/operators/gru_unit_op.cc index 8d9723289d9af..89c027ff1eea9 100644 --- a/paddle/operators/gru_unit_op.cc +++ b/paddle/operators/gru_unit_op.cc @@ -80,19 +80,21 @@ class GRUUnitOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("HiddenPrev", "(Tensor) Matrix with shape [batch_size, frame_size] for the " "states of previous time step."); - AddInput("Weight", - "(Tensor) Weight matrix with shape [frame_size, frame_size * 3]. " - "The elements continuous in memory can be divided into two parts. " - "The first part are weights of the update gate and reset gate " - "with shape [frame_size, frame_size * 2], and the second part are " - "weights of output candidate with shape [frame_size, frame_size]"); - AddInput("Bias", - "(Tensor) Bias vector with shape [1, frame_size * 3] concating " - "bias of the update gate, reset gate and output candidate.") + AddInput( + "Weight", + "(Tensor) Weight matrix with shape [frame_size, frame_size * 3]. " + "The elements continuous in memory can be divided into two parts. " + "The first part are weights of the update gate and reset gate " + "with shape [frame_size, frame_size * 2], and the second part are " + "weights of output candidate with shape [frame_size, frame_size]."); + AddInput( + "Bias", + "(Tensor) Bias vector with shape [1, frame_size * 3] concatenating " + "bias of the update gate, reset gate and output candidate.") .AsDispensable(); AddOutput("Gate", "(Tensor) Matrix with shape [batch_size, frame_size * 3] for the " - "output of update gate, reset gate and output candidate") + "output of update gate, reset gate and output candidate.") .AsIntermediate(); AddOutput("ResetHiddenPrev", "(Tensor) Matrix with shape [batch_size, frame_size] for the " @@ -112,16 +114,19 @@ class GRUUnitOpMaker : public framework::OpProtoAndCheckerMaker { .SetDefault(sigmoid) .InEnum({identity, sigmoid, tanh, relu}); AddComment(R"DOC( -GRUUnitOp implements part calculations of the GRU unit as following: +GRUUnit Operator. -\f[ -update \ gate: u_t = actGate(xu_t + W_u * hidden_prev + bias_u) \\ -reset \ gate: r_t = actGate(xr_t + W_r * hidden_prev + bias_r) \\ -output \ candidate: {h}_t = actNode(xc_t + W_c * dot(r_t, hidden_prev) + bias_c) \\ -output: h_t = dot((1-u_t), {h}_t) + dot(u_t, hidden_prev) -\f] +This operator implements partial calculations of the GRU unit as follows: + +$$ +update \ gate: u_t = actGate(xu_t + W_u * hidden_{prev} + bias_u) \\ +reset \ gate: r_t = actGate(xr_t + W_r * hidden_{prev} + bias_r) \\ +output \ candidate: {h}_t = actNode({xc}_t + W_c * dot(r_t, hidden_{prev}) + bias_c) \\ +output: h_t = dot((1-u_t), {h}_t) + dot(u_t, hidden_{prev}) +$$ The rest of GRU unit can be completed by using FCOp's output as the input of GRUUnitOp. + )DOC"); } }; diff --git a/paddle/operators/huber_loss_op.cc b/paddle/operators/huber_loss_op.cc index 2d9449f5ca50d..3435e74b0afb4 100644 --- a/paddle/operators/huber_loss_op.cc +++ b/paddle/operators/huber_loss_op.cc @@ -59,10 +59,12 @@ class HuberLossOpMaker : public framework::OpProtoAndCheckerMaker { "The shape is same as Input(X) and will be reused in backward.") .AsIntermediate(); AddOutput("Out", - "The output tensor with shape [batch_size, 1] which represents " - "the huber loss."); + "The output tensor with shape [batch_size, 1] " + "which represents the huber loss."); AddAttr("delta", "Hyper parameter in huber loss."); AddComment(R"DOC( +HuberLoss Operator. + Huber loss is a loss function used in robust regression. We define X as the input value and Y as the target value. Huber loss can evaluate the fitness of X to Y. Different from MSE loss, Huber loss is more robust for outliers. The diff --git a/paddle/operators/increment_op.cc b/paddle/operators/increment_op.cc index 139392c691e00..c3e9308fe0ad6 100644 --- a/paddle/operators/increment_op.cc +++ b/paddle/operators/increment_op.cc @@ -39,14 +39,18 @@ class IncrementOpMaker : public framework::OpProtoAndCheckerMaker { : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "(Tensor) The input tensor of increment operator"); AddOutput("Out", "(Tensor) The output tensor of increment operator."); - AddComment(R"DOC(Increment operator - -The equation is: Out = X + step -)DOC"); AddAttr("step", + "(float, default 1.0) " "The step size by which the " "input tensor will be incremented.") .SetDefault(1.0); + AddComment(R"DOC( +Increment Operator. + +The equation is: +$$Out = X + step$$ + +)DOC"); } }; diff --git a/paddle/operators/l1_norm_op.cc b/paddle/operators/l1_norm_op.cc index 1d111696cf43d..02ebf022968e9 100644 --- a/paddle/operators/l1_norm_op.cc +++ b/paddle/operators/l1_norm_op.cc @@ -57,7 +57,7 @@ L1 Norm Operator. Computes the L1 norm of a tensor. -Out = sum (abs(X)) +$$Out = \sum{|X|}$$ )DOC"); } diff --git a/paddle/operators/load_op.cc b/paddle/operators/load_op.cc index 2d4eff0c35af5..b71a33a6b1ce8 100644 --- a/paddle/operators/load_op.cc +++ b/paddle/operators/load_op.cc @@ -115,14 +115,18 @@ class LoadOpProtoMaker : public framework::OpProtoAndCheckerMaker { LoadOpProtoMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { - AddOutput("Out", "The tensor need to be loaded"); - AddComment(R"DOC(Load Operator -Load operator will load a tensor variable from disk file. -)DOC"); + AddOutput("Out", "(Tensor) The tensor need to be loaded"); AddAttr("file_path", + "(string) " "Variable will be loaded from \"file_path\".") .AddCustomChecker( [](const std::string &path) { return !path.empty(); }); + AddComment(R"DOC( +Load Operator. + +Load operator will load a tensor variable from disk file. + +)DOC"); } }; } // namespace operators diff --git a/paddle/operators/lookup_table_op.cc b/paddle/operators/lookup_table_op.cc index 0b361e20f2037..2163c8ce4e5d7 100644 --- a/paddle/operators/lookup_table_op.cc +++ b/paddle/operators/lookup_table_op.cc @@ -53,21 +53,27 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker { framework::OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("W", - "An input represents embedding tensors," - " which is a learnable parameter."); + "An input represents embedding tensors, " + "which is a learnable parameter."); AddInput("Ids", - "An input with type int32 or int64" - "contains the ids to be looked up in W." - "Ids must be a column vector with rank = 2." - "The 2nd dimension size must be 1"); - AddOutput("Out", "The lookup results, which have the same type with W."); - AddAttr("is_sparse", "Sparse update").SetDefault(false); + "An input with type int32 or int64 " + "contains the ids to be looked up in W. " + "Ids must be a column vector with rank = 2. " + "The 2nd dimension size must be 1."); + AddOutput("Out", "The lookup results, which have the same type as W."); + AddAttr("is_sparse", + "(boolean, default false) " + "Sparse update") + .SetDefault(false); AddComment(R"DOC( +Lookup Table Operator. + This operator is used to perform lookups on the parameter W, then concatenated into a dense tensor. -The input `Ids` can carry the LoD (Level of Details) information, -or not. And the output only shares the LoD with input `Ids`. +The input Ids can carry the LoD (Level of Details) information, +or not. And the output only shares the LoD information with input Ids. + )DOC"); } }; diff --git a/paddle/operators/lrn_op.cc b/paddle/operators/lrn_op.cc index 89ea6bfdbd9b7..00392b7967d02 100644 --- a/paddle/operators/lrn_op.cc +++ b/paddle/operators/lrn_op.cc @@ -45,72 +45,70 @@ class LRNOpMaker : public framework::OpProtoAndCheckerMaker { public: LRNOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("X", R"DOC( - (Tensor) The input of LRN operator. It must be a 4D tenor with NCHW format. - )DOC"); - + AddInput("X", + "(Tensor) The input of LRN operator. " + "It must be a 4D tenor with NCHW format."); AddOutput("Out", "(Tensor) The output of LRN operator, which is also the 4D " "tensor with NCHW format."); - AddOutput("MidOut", R"Doc( -(Tensor)Middle result of lrn op.It's computed in forward process -and also used in backward process. - )Doc"); - - AddAttr("n", R"DOC( -(int, default 5)n is “adjacent” kernel maps at the same spatial position. - )DOC") + AddOutput("MidOut", + "(Tensor) Middle result of LRN operator. It's computed in " + "forward process and also used in backward process."); + + AddAttr("n", + "(int default 5) " + "n is the \"adjacent\" kernel that maps " + "at the same spatial position.") .SetDefault(5) .GreaterThan(0); - AddAttr("k", R"DOC( -(float, default 2.0)k is the bias. - )DOC") + AddAttr("k", + "(float, default 2.0) " + "k is the bias.") .SetDefault(2.0) .GreaterThan(0.0); - AddAttr("alpha", R"DOC( -(float, default 0.0001)alpha is the scale number. - )DOC") + AddAttr("alpha", + "(float, default 0.0001) " + "alpha is the scale number.") .SetDefault(0.0001) .GreaterThan(0.0); - AddAttr("beta", R"DOC( -(float, default 0.75)beta is the power number. - )DOC") + AddAttr("beta", + "(float, default 0.75) " + "beta is the power number.") .SetDefault(0.75) .GreaterThan(0.0); AddComment(R"DOC( - Local Response Normalization. - - This Function comes from the paper - "ImageNet Classification with Deep Convolutional Neural Networks". +Local Response Normalization Operator. - The original formula is: +This operator comes from the paper +"ImageNet Classification with Deep Convolutional Neural Networks". - Input(i, x, y) - Output(i, x, y) = ---------------------------------------------- - -- upper - (k + alpha * > (Input(j, x, y))^2) ^ (beta) - -- j = lower +The original formula is: - upper is `min(C, c + n/2)` - lower if `max(0, c - n/2)` +$$ +Output(i, x, y) = Input(i, x, y) / \left( +k + \alpha \sum\limits^{\min(C, c + n/2)}_{j = \max(0, c - n/2)} +(Input(j, x, y))^2 +\right)^{\beta} +$$ - Function implementation: +Function implementation: - inputs and outpus is NCHW format, while input.shape.ndims() is equal 4. - And the meaning of each dimension(0-3) is respectively batch size, - feature maps, rows and columns. +Inputs and outpus are in NCHW format, while input.shape.ndims() equals 4. +And dimensions 0 ~ 3 represent batch size, feature maps, rows, +and columns, respectively. - Input and Output in the above formula is for each map(i) of one image, and - Input(i, x, y), Output(i, x, y) represents an element in an image. +Input and Output in the formula above is for each map(i) of one image, and +Input(i, x, y), Output(i, x, y) represents an element in an image. - C is the number of feature maps of one image, and n is a hyper-parameters - is configured when Function is initialized. The sum in the denominator - is the sum of the same position in the neighboring maps. - )DOC"); +C is the number of feature maps of one image. n is a hyper-parameter +configured when operator is initialized. The sum in the denominator +is the sum of the same positions in the neighboring maps. + +)DOC"); } }; diff --git a/paddle/operators/lstm_op.cc b/paddle/operators/lstm_op.cc index 94342d940704d..fdf52cf424d1b 100644 --- a/paddle/operators/lstm_op.cc +++ b/paddle/operators/lstm_op.cc @@ -103,7 +103,7 @@ class LSTMOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("H0", "(Tensor, optional) the initial hidden state is an optional " "input. This is a tensor with shape (N x D), where N is the " - "batch size, D is the hidden size.") + "batch size and D is the hidden size.") .AsDispensable(); AddInput("C0", "(Tensor, optional) the initial cell state is an optional " @@ -134,85 +134,82 @@ class LSTMOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("BatchGate", "(LoDTensor) This LoDTensor contains input gate, forget gate " "and output gate after the nonlinear computation. This " - "LoDTensor has the same shape with the reorganized input, which " + "LoDTensor has the same shape as the reorganized input, which " "is also be called batch input. The LoD size is 2. The first " "LoD is the batch offsets and the second LoD contains the " "indexes, which denote the position of reorganized sequence " "in the raw input.") .AsIntermediate(); AddOutput("BatchCellPreAct", - "(LoDTensor) This LoDTensor is got in the forward and used " + "(LoDTensor) This LoDTensor is obtained in the forward and used " "in the backward.") .AsIntermediate(); AddAttr("usePeepholes", - "(bool, defalut: True) " + "(bool, default True) " "whether to enable diagonal/peephole connections.") .SetDefault(true); AddAttr("isReverse", - "(bool, defalut: False) " + "(bool, default False) " "whether to compute reversed LSTM.") .SetDefault(false); AddAttr( "gateActivation", - "(string, default: sigmoid)" + "(string, default sigmoid)" "The activation for input gate, forget gate and output " "gate, `sigmoid` by default.") .SetDefault("sigmoid"); AddAttr("cellActivation", - "(string, default: tanh)" + "(string, default tanh)" "The activation for cell output, `tanh` by defalut.") .SetDefault("tanh"); AddAttr("candidateActivation", - "(string, default: tanh)" + "(string, default tanh)" "The activation for candidate hidden state, " "`tanh` by default.") .SetDefault("tanh"); - AddComment(R"DOC(Long-Short Term Memory (LSTM) Operator + AddComment(R"DOC( +Long-Short Term Memory (LSTM) Operator. -The defalut implementation is diagonal/peephole connection [1], the formula is -as follows +The defalut implementation is diagonal/peephole connection +(https://arxiv.org/pdf/1402.1128.pdf), the formula is as follows: - i_t = \sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + W_{ic}c_{t-1} + b_i) +$$ +i_t = \sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + W_{ic}c_{t-1} + b_i) \\ - f_t = \sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + W_{fc}c_{t-1} + b_f) +f_t = \sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + W_{fc}c_{t-1} + b_f) \\ - \tilde{c_t} = act_g(W_{cx}x_t + W_{ch}h_{t-1} + b_c) +\tilde{c_t} = act_g(W_{cx}x_t + W_{ch}h_{t-1} + b_c) \\ - o_t = \sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + W_{oc}c_t + b_o) +o_t = \sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + W_{oc}c_t + b_o) \\ - c_t = f_t ⊙ c_{t-1} + i_t ⊙ \tilde{c_t} +c_t = f_t \odot c_{t-1} + i_t \odot \tilde{c_t} \\ - h_t = o_t ⊙ act_h(c_t) +h_t = o_t \odot act_h(c_t) +$$ where the W terms denote weight matrices (e.g. \f$W_{xi}\f$ is the matrix of weights from the input gate to the input), \f$W_{ic}, W_{fc}, W_{oc}\f$ -are diagonal weight matrices for peephole connections. In our implenmention, -We use vectors to reprenset these diagonal weight matrices. The b terms +are diagonal weight matrices for peephole connections. In our implementation, +we use vectors to reprenset these diagonal weight matrices. The b terms denote bias vectors (\f$b_i\f$ is the input gate bias vector), \f$\sigma\f$ -is the non-line actications, such as logistic sigmoid function, and -\f$i, f, o\f$ and \f$c\f$ are respectively the input gate, forget gate, -output gate and cell activation vectors, all of which are the same size as +is the non-line activations, such as logistic sigmoid function, and +\f$i, f, o\f$ and \f$c\f$ are the input gate, forget gate, output gate, +and cell activation vectors, respectively, all of which have the same size as the cell output activation vector \f$h\f$. -The ⊙ is the element-wise product of the vectors, \f$act_g\f$ and \f$act_h\f$ -are the cell input and cell output activation functions, `tanh` is usually +The \f$\odot\f$ is the element-wise product of the vectors. \f$act_g\f$ and \f$act_h\f$ +are the cell input and cell output activation functions and `tanh` is usually used for them. \f$\tilde{c_t}\f$ is also called candidate hidden state, which is computed based on the current input and the previous hidden state. -Set `usePeepholes` False to disable peephole connection [2]. The formula +Set usePeepholes False to disable peephole connection +(http://www.bioinf.jku.at/publications/older/2604.pdf). The formula is omitted here. -@note These \f$W_{xi}x_{t}, W_{xf}x_{t}, W_{xc}x_{t}, W_{xo}x_{t}\f$ -operations on the input x_{t} were NOT included in this operator. +Note that these \f$W_{xi}x_{t}, W_{xf}x_{t}, W_{xc}x_{t}, W_{xo}x_{t}\f$ +operations on the input \f$x_{t}\f$ are NOT included in this operator. Users can choose to use fully-connect operator before LSTM operator. -[1] Hasim Sak, Andrew Senior, and Francoise Beaufays. Long short-term memory -recurrent neural network architectures for large scale acoustic modeling. -INTERSPEECH, 2014. - -[2] S. Hochreiter and J. Schmidhuber. Long Short-Term Memory. -Neural Computation, 9(8):1735-1780, 1997. - )DOC"); } }; diff --git a/paddle/operators/lstm_unit_op.cc b/paddle/operators/lstm_unit_op.cc index 5d63017208a55..f4519ec16f3f6 100644 --- a/paddle/operators/lstm_unit_op.cc +++ b/paddle/operators/lstm_unit_op.cc @@ -57,17 +57,22 @@ class LstmUnitOpMaker : public framework::OpProtoAndCheckerMaker { "The cell state tensor of last time-step in the Lstm Unit operator."); AddOutput("C", "The cell tensor of Lstm Unit operator."); AddOutput("H", "The hidden state tensor of Lstm Unit operator."); - - AddComment(R"DOC(Lstm-Unit Operator + AddAttr("forget_bias", + "(float, default 0.0) " + "The forget bias of Lstm Unit.") + .SetDefault(0.0); + AddComment(R"DOC( +Lstm Unit Operator Equation: - i, f, o, j = split(X) - C = C_prev * sigm(f + forget_bias) + sigm(i) * tanh(j) - H = C * sigm(o) + +$$ +i, f, o, j = split(X) \\ +C = C_{prev} * sigm(f + forget\_bias) + sigm(i) * tanh(j) \\ +H = C * sigm(o) +$$ )DOC"); - AddAttr("forget_bias", "The forget bias of Lstm Unit.") - .SetDefault(0.0); } };