diff --git a/doc/algorithm/rnn/rnn.rst b/doc/algorithm/rnn/rnn.rst index 0ab75a130686f8..343f55a20e464f 100644 --- a/doc/algorithm/rnn/rnn.rst +++ b/doc/algorithm/rnn/rnn.rst @@ -142,6 +142,7 @@ We also project the encoder vector to :code:`decoder_size` dimensional space, ge The decoder uses :code:`recurrent_group` to define the recurrent neural network. The step and output functions are defined in :code:`gru_decoder_with_attention`: .. code-block:: python + group_inputs=[StaticInput(input=encoded_vector,is_seq=True), StaticInput(input=encoded_proj,is_seq=True)] trg_embedding = embedding_layer( diff --git a/doc/ui/api/trainer_config_helpers/optimizers.rst b/doc/ui/api/trainer_config_helpers/optimizers.rst index 3c683914f4d149..b487fec64c4ebb 100644 --- a/doc/ui/api/trainer_config_helpers/optimizers.rst +++ b/doc/ui/api/trainer_config_helpers/optimizers.rst @@ -4,6 +4,12 @@ BaseSGDOptimizer :members: BaseSGDOptimizer :noindex: +MomentumOptimizer +================= +.. automodule:: paddle.trainer_config_helpers.optimizers + :members: MomentumOptimizer + :noindex: + AdamOptimizer ============= .. 
automodule:: paddle.trainer_config_helpers.optimizers diff --git a/paddle/trainer/tests/CMakeLists.txt b/paddle/trainer/tests/CMakeLists.txt index aabf44d6512004..60c129f4e2386e 100644 --- a/paddle/trainer/tests/CMakeLists.txt +++ b/paddle/trainer/tests/CMakeLists.txt @@ -47,7 +47,7 @@ add_test(NAME test_CompareTwoOpts COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python/ ${CMAKE_CURRENT_BINARY_DIR}/test_CompareTwoOpts --config_file_a=trainer/tests/sample_trainer_config_opt_a.conf --config_file_b=trainer/tests/sample_trainer_config_opt_b.conf - --num_passes=1 --need_high_accuracy=1 + --num_passes=1 --need_high_accuracy=0 WORKING_DIRECTORY ${PROJ_ROOT}/paddle/) ################# test_CompareSparse ################## diff --git a/paddle/trainer/tests/mnist.list b/paddle/trainer/tests/mnist.list new file mode 100644 index 00000000000000..703e87753d5a4f --- /dev/null +++ b/paddle/trainer/tests/mnist.list @@ -0,0 +1 @@ +trainer/tests/mnist_bin_part diff --git a/paddle/trainer/tests/mnist_bin_part b/paddle/trainer/tests/mnist_bin_part new file mode 100644 index 00000000000000..08b93a0ebb5698 Binary files /dev/null and b/paddle/trainer/tests/mnist_bin_part differ diff --git a/paddle/trainer/tests/sample_trainer_config_opt_a.conf b/paddle/trainer/tests/sample_trainer_config_opt_a.conf index 61d2c62d4296a3..f5b1988ddaf5fb 100644 --- a/paddle/trainer/tests/sample_trainer_config_opt_a.conf +++ b/paddle/trainer/tests/sample_trainer_config_opt_a.conf @@ -12,32 +12,29 @@ # See the License for the specific language governing permissions and # limitations under the License. -#Todo(luotao02) This config is only used for unitest. It is out of date now, and will be updated later. 
+from paddle.trainer_config_helpers import * ################################### Data Configuration ################################### -TrainData(ProtoData(files = "train.list")) +TrainData(ProtoData(files = "trainer/tests/mnist.list")) ################################### Algorithm Configuration ################################### -Settings( - learning_rate_decay_a = 0.0, - learning_rate_decay_b = 0.0, - learning_rate = 1e-03, - batch_size = 1000, - algorithm = 'sgd', - num_batches_per_send_parameter = 1, - num_batches_per_get_parameter = 1, - learning_method='sparse_momentum', -) -default_momentum(0.5) +settings(batch_size = 1000, + learning_method = MomentumOptimizer(momentum=0.5, sparse=False)) ################################### Network Configuration ################################### -Layer(type = "data", name = "input", size = 784) -Layer(inputs = [Input("input", parameter_name = "_layer1.w")], name = "layer1", bias = Bias(parameter_name = "_layer1.bias"), active_type = "sigmoid", type = "fc", size = 800) -Layer(inputs = [Input("layer1", parameter_name = "_layer2.w")], name = "layer2", bias = Bias(parameter_name = "_layer2.bias"), active_type = "sigmoid", type = "fc", size = 800) -#Layer(inputs = [Input("layer2", parameter_name = "_layer_output.w", decay_rate = 0.02)], name = "output", bias = Bias(parameter_name = "_layer_output.bias"), active_type = "margin", type = "fc", size = 10) -#Layer(inputs = [Input("layer2", parameter_name = "_layer_output.w", decay_rate = 0.02)], name = "output", bias = Bias(parameter_name = "_layer_output.bias"), type = "fc", size = 10) -Layer(inputs = [Input("layer2", parameter_name = "_layer_output.w")], name = "output", bias = Bias(parameter_name = "_layer_output.bias"), active_type = "softmax", type = "fc", size = 10) -Layer(type = "data", name = "label", size = 1) -Layer(inputs = [Input("output"), Input("label")], type = "multi-class-cross-entropy", name = "cost") -#Layer(inputs = [Input("output"), Input("label")], type = 
"huber", name = "cost") -Evaluator(inputs=["output", "label"], type = "classification_error", name = "classification_error") -Inputs("input", "label") -Outputs("cost") +data = data_layer(name ="input", size=784) + +fc1 = fc_layer(input=data, size=800, + bias_attr=True, + act=SigmoidActivation()) + +fc2 = fc_layer(input=fc1, size=800, + bias_attr=True, + act=SigmoidActivation()) + +output = fc_layer(input=[fc1, fc2], size=10, + bias_attr=True, + act=SoftmaxActivation()) + +lbl = data_layer(name ="label", size=1) + +cost = classification_cost(input=output, label=lbl) +outputs(cost) diff --git a/paddle/trainer/tests/sample_trainer_config_opt_b.conf b/paddle/trainer/tests/sample_trainer_config_opt_b.conf index 82d547dd8a0d05..f5b1988ddaf5fb 100644 --- a/paddle/trainer/tests/sample_trainer_config_opt_b.conf +++ b/paddle/trainer/tests/sample_trainer_config_opt_b.conf @@ -12,32 +12,29 @@ # See the License for the specific language governing permissions and # limitations under the License. -#Todo(luotao02) This config is only used for unitest. It is out of date now, and will be updated later. 
+from paddle.trainer_config_helpers import * ################################### Data Configuration ################################### -TrainData(ProtoData(files = "train.list")) +TrainData(ProtoData(files = "trainer/tests/mnist.list")) ################################### Algorithm Configuration ################################### -Settings( - learning_rate_decay_a = 0.0, - learning_rate_decay_b = 0.0, - learning_rate = 1e-03, - batch_size = 1000, - algorithm = 'sgd', - num_batches_per_send_parameter = 1, - num_batches_per_get_parameter = 1, - learning_method='momentum', -) -default_momentum(0.5) +settings(batch_size = 1000, + learning_method = MomentumOptimizer(momentum=0.5, sparse=False)) ################################### Network Configuration ################################### -Layer(type = "data", name = "input", size = 784) -Layer(inputs = [Input("input", parameter_name = "_layer1.w")], name = "layer1", bias = Bias(parameter_name = "_layer1.bias"), active_type = "sigmoid", type = "fc", size = 800) -Layer(inputs = [Input("layer1", parameter_name = "_layer2.w")], name = "layer2", bias = Bias(parameter_name = "_layer2.bias"), active_type = "sigmoid", type = "fc", size = 800) -#Layer(inputs = [Input("layer2", parameter_name = "_layer_output.w", decay_rate = 0.02)], name = "output", bias = Bias(parameter_name = "_layer_output.bias"), active_type = "margin", type = "fc", size = 10) -#Layer(inputs = [Input("layer2", parameter_name = "_layer_output.w", decay_rate = 0.02)], name = "output", bias = Bias(parameter_name = "_layer_output.bias"), type = "fc", size = 10) -Layer(inputs = [Input("layer2", parameter_name = "_layer_output.w")], name = "output", bias = Bias(parameter_name = "_layer_output.bias"), active_type = "softmax", type = "fc", size = 10) -Layer(type = "data", name = "label", size = 1) -Layer(inputs = [Input("output"), Input("label")], type = "multi-class-cross-entropy", name = "cost") -#Layer(inputs = [Input("output"), Input("label")], type = "huber", 
name = "cost") -Evaluator(inputs=["output", "label"], type = "classification_error", name = "classification_error") -Inputs("input", "label") -Outputs("cost") +data = data_layer(name ="input", size=784) + +fc1 = fc_layer(input=data, size=800, + bias_attr=True, + act=SigmoidActivation()) + +fc2 = fc_layer(input=fc1, size=800, + bias_attr=True, + act=SigmoidActivation()) + +output = fc_layer(input=[fc1, fc2], size=10, + bias_attr=True, + act=SoftmaxActivation()) + +lbl = data_layer(name ="label", size=1) + +cost = classification_cost(input=output, label=lbl) +outputs(cost) diff --git a/python/paddle/trainer_config_helpers/optimizers.py b/python/paddle/trainer_config_helpers/optimizers.py index af85f745f63e59..4660a6b5003daf 100644 --- a/python/paddle/trainer_config_helpers/optimizers.py +++ b/python/paddle/trainer_config_helpers/optimizers.py @@ -71,16 +71,41 @@ def to_setting_kwargs(self): class MomentumOptimizer(BaseSGDOptimizer): + """ + MomentumOptimizer. + + When sparse=True, the update scheme is: + + .. math:: + + \\alpha_t &= \\alpha_{t-1} / k \\\\ + \\beta_t &= \\beta_{t-1} / (1 + \\lambda \\gamma_t) \\\\ + u_t &= u_{t-1} - \\alpha_t \\gamma_t g_t \\\\ + v_t &= v_{t-1} + \\tau_{t-1} \\alpha_t \\gamma_t g_t \\\\ + \\tau_t &= \\tau_{t-1} + \\beta_t / \\alpha_t + + where :math:`k` is the momentum, :math:`\\lambda` is the decay rate, + :math:`\\gamma_t` is the learning rate at the t-th step. + + :param sparse: whether to use the sparse momentum update. + :type sparse: bool + """ def extra_settings(self): default_momentum(self.momentum) def to_setting_kwargs(self): - return { - 'learning_method': 'momentum' - } + if self.sparse: + return { + 'learning_method': 'sparse_momentum' + } + else: + return { + 'learning_method': 'momentum' + } - def __init__(self, momentum=None): + def __init__(self, momentum=None, sparse=False): self.momentum = momentum + self.sparse = sparse class AdamOptimizer(BaseSGDOptimizer):