diff --git a/python/paddle/fluid/layers/rnn.py b/python/paddle/fluid/layers/rnn.py index 84d567330a797..079187e09c916 100644 --- a/python/paddle/fluid/layers/rnn.py +++ b/python/paddle/fluid/layers/rnn.py @@ -17,6 +17,7 @@ import sys from functools import partial, reduce +import paddle from . import nn from . import tensor from . import control_flow @@ -507,6 +508,9 @@ def append(self, x): self.array.append(x) return self + def __getitem__(self, item): + return self.array.__getitem__(item) + def _maybe_copy(state, new_state, step_mask): """update rnn state or just pass the old state through""" @@ -859,8 +863,6 @@ def tracks_own_finished(self): class BeamSearchDecoder(Decoder): """ - :api_attr: Static Graph - Decoder with beam search decoding strategy. It wraps a cell to get probabilities, and follows a beam search step to calculate scores and select candidate token ids for each decoding step. @@ -881,24 +883,20 @@ class BeamSearchDecoder(Decoder): .. code-block:: python - import paddle.fluid as fluid - from paddle.fluid.layers import GRUCell, BeamSearchDecoder - - trg_embeder = lambda x: fluid.embedding( - x, size=[10000, 128], param_attr=fluid.ParamAttr(name="trg_embedding")) - output_layer = lambda x: layers.fc(x, - size=10000, - num_flatten_dims=len(x.shape) - 1, - param_attr=fluid.ParamAttr(name= - "output_w"), - bias_attr=False) - decoder_cell = GRUCell(hidden_size=128) + import numpy as np + import paddle + from paddle.nn import BeamSearchDecoder, dynamic_decode + from paddle.nn import GRUCell, Linear, Embedding + trg_embeder = Embedding(100, 32) + output_layer = Linear(32, 32) + decoder_cell = GRUCell(input_size=32, hidden_size=32) decoder = BeamSearchDecoder(decoder_cell, start_token=0, end_token=1, beam_size=4, embedding_fn=trg_embeder, output_fn=output_layer) + """ def __init__(self, @@ -912,16 +910,13 @@ def __init__(self, Constructor of BeamSearchDecoder. Parameters: - cell(RNNCell): An instance of `RNNCell` or object with the same interface. + cell(RNNCellBase): An instance of `RNNCellBase` or object with the same interface. start_token(int): The start token id. end_token(int): The end token id. beam_size(int): The beam width used in beam search. embedding_fn(optional): A callable to apply to selected candidate ids. Mostly it is an embedding layer to transform ids to embeddings, and the returned value acts as the `input` argument for `cell.call`. - **Note that fluid.embedding should be used here rather than - fluid.layers.embedding, since shape of ids is [batch_size, beam_size]. - when using fluid.layers.embedding, must unsqueeze in embedding_fn.** If not provided, the id to embedding transformation must be built into `cell.call`. Default None. output_fn(optional): A callable to apply to the cell's output prior to @@ -1150,6 +1145,8 @@ def initialize(self, initial_cell_states): np.array( [[0.] 
+ [-self.kinf] * (self.beam_size - 1)], dtype="float32")), [self.batch_size, 1]) + if paddle.get_default_dtype() == "float64": + log_probs = tensor.cast(log_probs, "float64") # TODO: remove the restriction of force_cpu init_finished = tensor.fill_constant_batch_size_like( input=state, @@ -1197,7 +1194,11 @@ def _beam_search_step(self, time, logits, next_cell_states, beam_state): shape=[1], dtype="int64", value=self.vocab_size) noend_array = [-self.kinf] * self.vocab_size noend_array[self.end_token] = 0 + self.noend_mask_tensor = tensor.assign(np.array(noend_array, "float32")) + if paddle.get_default_dtype() == "float64": + self.noend_mask_tensor = tensor.cast(self.noend_mask_tensor, + "float64") step_log_probs = nn.log(nn.softmax(logits)) step_log_probs = self._mask_probs(step_log_probs, beam_state.finished) @@ -1328,98 +1329,103 @@ def tracks_own_finished(self): return True -def dynamic_decode(decoder, - inits=None, - max_step_num=None, - output_time_major=False, - impute_finished=False, - is_test=False, - return_length=False, - **kwargs): - """ - :api_attr: Static Graph +def _dynamic_decode_imperative(decoder, + inits=None, + max_step_num=None, + output_time_major=False, + impute_finished=False, + is_test=False, + return_length=False, + **kwargs): + def _maybe_copy(state, new_state, step_mask): + # TODO: use where_op + state_dtype = state.dtype + if convert_dtype(state_dtype) in ["bool"]: + state = tensor.cast(state, dtype="float32") + new_state = tensor.cast(new_state, dtype="float32") + if step_mask.dtype != state.dtype: + step_mask = tensor.cast(step_mask, dtype=state.dtype) + # otherwise, the renamed bool gradients would be summed up, + # leading to a sum(bool) error. + step_mask.stop_gradient = True + new_state = nn.elementwise_mul( + state, step_mask, axis=0) - nn.elementwise_mul( + new_state, (step_mask - 1), axis=0) + if convert_dtype(state_dtype) in ["bool"]: + new_state = tensor.cast(new_state, dtype=state_dtype) + return new_state - Dynamic decoding performs :code:`decoder.step()` repeatedly until the returned - Tensor indicating finished status contains all True values or the number of - decoding step reaches to :attr:`max_step_num`. + initial_inputs, initial_states, initial_finished = decoder.initialize(inits) + inputs, states, finished = (initial_inputs, initial_states, + initial_finished) + cond = control_flow.logical_not((nn.reduce_all(initial_finished))) + sequence_lengths = tensor.cast(tensor.zeros_like(initial_finished), "int64") + outputs = None + + step_idx = 0 + step_idx_tensor = tensor.fill_constant( + shape=[1], dtype="int64", value=step_idx) + while cond.numpy(): + (step_outputs, next_states, next_inputs, next_finished) = decoder.step( + step_idx_tensor, inputs, states, **kwargs) + if not decoder.tracks_own_finished: + # BeamSearchDecoder tracks its own finished status, since + # beams would be reordered and the finished status of each + # entry might change. Otherwise, perform a logical OR, which + # would not change entries that have already finished. + next_finished = control_flow.logical_or(next_finished, finished) + # Keep states.finished/finished consistent with + # next_finished. + tensor.assign(next_finished, finished) + next_sequence_lengths = nn.elementwise_add( + sequence_lengths, + tensor.cast( + control_flow.logical_not(finished), sequence_lengths.dtype)) - :code:`decoder.initialize()` would be called once before the decoding loop. - If the `decoder` has implemented `finalize` method, :code:`decoder.finalize()` - would be called once after the decoding loop.
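+ # Descriptive note on the copy-through below: when `impute_finished` is True, + # entries that have already finished keep their previous states, since + # `_maybe_copy` blends old and new states with the boolean `finished` mask, + # computing state * mask + new_state * (1 - mask).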
+ if impute_finished: # rectify the states for the finished. + next_states = map_structure( + lambda x, y: _maybe_copy(x, y, finished), states, next_states) + outputs = map_structure( + lambda x: ArrayWrapper(x), + step_outputs) if step_idx == 0 else map_structure( + lambda x, x_array: x_array.append(x), step_outputs, outputs) + inputs, states, finished, sequence_lengths = ( + next_inputs, next_states, next_finished, next_sequence_lengths) - Parameters: - decoder(Decoder): An instance of `Decoder`. - inits(object, optional): Argument passed to `decoder.initialize`. - Default `None`. - max_step_num(int, optional): The maximum number of steps. If not provided, - decode until the decoder is fully done, or in other words, the returned - Tensor by :code:`decoder.step()` indicating finished status contains - all True. Default `None`. - output_time_major(bool, optional): Indicate the data layout of Tensor included - in the final outputs(the first returned value of this method). If - attr:`False`, the data layout would be batch major with shape - `[batch_size, seq_len, ...]`. If attr:`True`, the data layout would - be time major with shape `[seq_len, batch_size, ...]`. Default: `False`. - impute_finished(bool, optional): If `True`, then states get copied through - for batch entries which are marked as finished, which differs with the - unfinished using the new states returned by :code:`decoder.step()` and - ensures that the final states have the correct values. Otherwise, states - wouldn't be copied through when finished. If the returned `final_states` - is needed, it should be set as True, which causes some slowdown. - Default `False`. - is_test(bool, optional): A flag indicating whether to use test mode. In - test mode, it is more memory saving. Default `False`. - return_length(bool, optional): A flag indicating whether to return an - extra Tensor variable in the output tuple, which stores the actual - lengths of all decoded sequences. Default `False`. - **kwargs: Additional keyword arguments. Arguments passed to `decoder.step`. + control_flow.increment(x=step_idx_tensor, value=1.0, in_place=True) + step_idx += 1 - Returns: - tuple: A tuple( :code:`(final_outputs, final_states, sequence_lengths)` ) \ - when `return_length` is True, otherwise a tuple( :code:`(final_outputs, final_states)` ). \ - The final outputs and states, both are Tensor or nested structure of Tensor. \ - `final_outputs` has the same structure and data types as the :code:`outputs` \ - returned by :code:`decoder.step()` , and each Tenser in `final_outputs` \ - is the stacked of all decoding steps' outputs, which might be revised \ - by :code:`decoder.finalize()` if the decoder has implemented `finalize`. \ - `final_states` is the counterpart at last time step of initial states \ - returned by :code:`decoder.initialize()` , thus has the same structure \ - with it and has tensors with same shapes and data types. `sequence_lengths` \ - is an `int64` tensor with the same shape as `finished` returned \ - by :code:`decoder.initialize()` , and it stores the actual lengths of \ - all decoded sequences. - + control_flow.logical_not(nn.reduce_all(finished), cond) + if max_step_num is not None and step_idx > max_step_num: + break - Examples: + final_outputs = map_structure(lambda x: nn.stack(x.array, axis=0), outputs) + final_states = states - .. 
code-block:: python - - import paddle.fluid as fluid - import paddle.fluid.layers as layers - from paddle.fluid.layers import GRUCell, BeamSearchDecoder, dynamic_decode + try: + final_outputs, final_states = decoder.finalize( + final_outputs, final_states, sequence_lengths) + except NotImplementedError: + pass - encoder_output = fluid.data(name="encoder_output", - shape=[-1, 32, 128], - dtype="float32") - trg_embeder = lambda x: fluid.embedding( - x, size=[10000, 128], param_attr=fluid.ParamAttr(name="trg_embedding")) - output_layer = lambda x: layers.fc(x, - size=10000, - num_flatten_dims=len(x.shape) - 1, - param_attr=fluid.ParamAttr(name= - "output_w"), - bias_attr=False) - decoder_cell = GRUCell(hidden_size=128) - decoder = BeamSearchDecoder(decoder_cell, - start_token=0, - end_token=1, - beam_size=4, - embedding_fn=trg_embeder, - output_fn=output_layer) + if not output_time_major: + final_outputs = map_structure( + lambda x: nn.transpose(x, [1, 0] + list(range(2, len(x.shape)))), + final_outputs) - outputs = dynamic_decode( - decoder=decoder, inits=decoder_cell.get_initial_states(encoder_output)) - """ + return (final_outputs, final_states, + sequence_lengths) if return_length else (final_outputs, + final_states) + + +def _dynamic_decode_declarative(decoder, + inits=None, + max_step_num=None, + output_time_major=False, + impute_finished=False, + is_test=False, + return_length=False, + **kwargs): initial_inputs, initial_states, initial_finished = decoder.initialize(inits) global_inputs, global_states, global_finished = ( initial_inputs, initial_states, initial_finished) @@ -1558,6 +1564,98 @@ def _create_array_out_of_while(dtype): final_states) +def dynamic_decode(decoder, + inits=None, + max_step_num=None, + output_time_major=False, + impute_finished=False, + is_test=False, + return_length=False, + **kwargs): + """ + Dynamic decoding performs :code:`decoder.step()` repeatedly until the returned + Tensor indicating finished status contains all True values or the number of + decoding steps reaches :attr:`max_step_num`. + + :code:`decoder.initialize()` would be called once before the decoding loop. + If the `decoder` has implemented the `finalize` method, :code:`decoder.finalize()` + would be called once after the decoding loop. + + Parameters: + decoder(Decoder): An instance of `Decoder`. + inits(object, optional): Argument passed to `decoder.initialize`. + Default `None`. + max_step_num(int, optional): The maximum number of steps. If not provided, + decode until the decoder is fully done, or in other words, until the + Tensor returned by :code:`decoder.step()` indicating finished status contains + all True values. Default `None`. + output_time_major(bool, optional): Indicate the data layout of the Tensors included + in the final outputs (the first returned value of this method). If + :attr:`False`, the data layout would be batch major with shape + `[batch_size, seq_len, ...]`. If :attr:`True`, the data layout would + be time major with shape `[seq_len, batch_size, ...]`. Default: `False`. + impute_finished(bool, optional): If `True`, then states get copied through + for batch entries which are marked as finished, while unfinished entries + use the new states returned by :code:`decoder.step()`; this ensures that + the final states have the correct values. Otherwise, states wouldn't be + copied through for finished entries. If the returned `final_states` + is needed, this should be set to True, which causes some slowdown. + Default `False`. + is_test(bool, optional): A flag indicating whether to use test mode.
In + test mode, it saves more memory. Default `False`. + return_length(bool, optional): A flag indicating whether to return an + extra Tensor variable in the output tuple, which stores the actual + lengths of all decoded sequences. Default `False`. + **kwargs: Additional keyword arguments. Arguments passed to `decoder.step`. + + Returns: + tuple: A tuple( :code:`(final_outputs, final_states, sequence_lengths)` ) \ + when `return_length` is True, otherwise a tuple( :code:`(final_outputs, final_states)` ). \ + Both the final outputs and states are a Tensor or a nested structure of Tensors. \ + `final_outputs` has the same structure and data types as the :code:`outputs` \ + returned by :code:`decoder.step()` , and each Tensor in `final_outputs` \ + is the stacked result of all decoding steps' outputs, which might be revised \ + by :code:`decoder.finalize()` if the decoder has implemented `finalize`. \ + `final_states` is the counterpart, at the last time step, of the initial states \ + returned by :code:`decoder.initialize()` , thus it has the same structure \ + as them and holds tensors with the same shapes and data types. `sequence_lengths` \ + is an `int64` tensor with the same shape as `finished` returned \ + by :code:`decoder.initialize()` , and it stores the actual lengths of \ + all decoded sequences. + + + Examples: + + .. code-block:: python + + import numpy as np + import paddle + from paddle.nn import BeamSearchDecoder, dynamic_decode + from paddle.nn import GRUCell, Linear, Embedding + trg_embeder = Embedding(100, 32) + output_layer = Linear(32, 32) + decoder_cell = GRUCell(input_size=32, hidden_size=32) + decoder = BeamSearchDecoder(decoder_cell, + start_token=0, + end_token=1, + beam_size=4, + embedding_fn=trg_embeder, + output_fn=output_layer) + encoder_output = paddle.ones((4, 8, 32), dtype=paddle.get_default_dtype()) + outputs = dynamic_decode(decoder=decoder, + inits=decoder_cell.get_initial_states(encoder_output), + max_step_num=10) + """ + if in_dygraph_mode(): + return _dynamic_decode_imperative(decoder, inits, max_step_num, + output_time_major, impute_finished, + is_test, return_length, **kwargs) + else: + return _dynamic_decode_declarative(decoder, inits, max_step_num, + output_time_major, impute_finished, + is_test, return_length, **kwargs) + + class DecodeHelper(object): """ DecodeHelper is the base class for any helper instance used in `BasicDecoder`. diff --git a/python/paddle/fluid/tests/unittests/test_rnn_decode_api.py b/python/paddle/fluid/tests/unittests/test_rnn_decode_api.py index 6ca194b2694b6..066d0a37e1361 100644 --- a/python/paddle/fluid/tests/unittests/test_rnn_decode_api.py +++ b/python/paddle/fluid/tests/unittests/test_rnn_decode_api.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License.
@@ -14,9 +14,17 @@ from __future__ import print_function +import random import unittest import numpy as np +import paddle +import paddle.nn as nn +from paddle import Model, set_device +from paddle.static import InputSpec as Input +from paddle.fluid.dygraph import Layer +from paddle.nn import BeamSearchDecoder, dynamic_decode + import paddle.fluid as fluid import paddle.fluid.layers as layers import paddle.fluid.core as core @@ -24,6 +32,8 @@ from paddle.fluid.executor import Executor from paddle.fluid import framework +paddle.enable_static() + class EncoderCell(layers.RNNCell): def __init__(self, num_layers, hidden_size, dropout_prob=0.): @@ -436,6 +446,7 @@ def setUp(self): self.exe = Executor(place) def test_mle_train(self): + paddle.enable_static() self.model_hparams["decoding_strategy"] = "train_greedy" agent = SeqPGAgent( model_cls=Seq2SeqModel, @@ -468,6 +479,7 @@ def test_mle_train(self): (iter_idx, reward.mean(), cost)) def test_greedy_train(self): + paddle.enable_static() self.model_hparams["decoding_strategy"] = "infer_greedy" agent = SeqPGAgent( model_cls=Seq2SeqModel, @@ -493,6 +505,7 @@ def test_greedy_train(self): (iter_idx, reward.mean(), cost)) def test_sample_train(self): + paddle.enable_static() self.model_hparams["decoding_strategy"] = "infer_sample" agent = SeqPGAgent( model_cls=Seq2SeqModel, @@ -518,6 +531,8 @@ def test_sample_train(self): (iter_idx, reward.mean(), cost)) def test_beam_search_infer(self): + paddle.set_default_dtype("float32") + paddle.enable_static() self.model_hparams["decoding_strategy"] = "beam_search" main_program = fluid.Program() startup_program = fluid.Program() @@ -542,5 +557,154 @@ def test_beam_search_infer(self): fetch_list=[output])[0] +class ModuleApiTest(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls._np_rand_state = np.random.get_state() + cls._py_rand_state = random.getstate() + cls._random_seed = 123 + np.random.seed(cls._random_seed) + random.seed(cls._random_seed) + + cls.model_cls = type(cls.__name__ + "Model", (Layer, ), { + "__init__": cls.model_init_wrapper(cls.model_init), + "forward": cls.model_forward + }) + + @classmethod + def tearDownClass(cls): + np.random.set_state(cls._np_rand_state) + random.setstate(cls._py_rand_state) + + @staticmethod + def model_init_wrapper(func): + def __impl__(self, *args, **kwargs): + Layer.__init__(self) + func(self, *args, **kwargs) + + return __impl__ + + @staticmethod + def model_init(model, *args, **kwargs): + raise NotImplementedError( + "model_init acts as `Model.__init__`, thus must implement it") + + @staticmethod + def model_forward(model, *args, **kwargs): + return model.module(*args, **kwargs) + + def make_inputs(self): + # TODO(guosheng): add default from `self.inputs` + raise NotImplementedError( + "model_inputs makes inputs for model, thus must implement it") + + def setUp(self): + """ + For the model which wraps the module to be tested: + Set input data by `self.inputs` list + Set init argument values by `self.attrs` list/dict + Set model parameter values by `self.param_states` dict + Set expected output data by `self.outputs` list + We can create a model instance and run once with these. 
+ """ + self.inputs = [] + self.attrs = {} + self.param_states = {} + self.outputs = [] + + def _calc_output(self, place, mode="test", dygraph=True): + if dygraph: + fluid.enable_dygraph(place) + else: + fluid.disable_dygraph() + gen = paddle.manual_seed(self._random_seed) + gen._is_init_py = False + paddle.framework.random._manual_program_seed(self._random_seed) + scope = fluid.core.Scope() + with fluid.scope_guard(scope): + layer = self.model_cls(**self.attrs) if isinstance( + self.attrs, dict) else self.model_cls(*self.attrs) + model = Model(layer, inputs=self.make_inputs()) + model.prepare() + if self.param_states: + model.load(self.param_states, optim_state=None) + return model.test_batch(self.inputs) + + def check_output_with_place(self, place, mode="test"): + dygraph_output = self._calc_output(place, mode, dygraph=True) + stgraph_output = self._calc_output(place, mode, dygraph=False) + expect_output = getattr(self, "outputs", None) + for actual_t, expect_t in zip(dygraph_output, stgraph_output): + self.assertTrue(np.allclose(actual_t, expect_t, rtol=1e-5, atol=0)) + if expect_output: + for actual_t, expect_t in zip(dygraph_output, expect_output): + self.assertTrue( + np.allclose( + actual_t, expect_t, rtol=1e-5, atol=0)) + + def check_output(self): + devices = ["CPU", "GPU"] if fluid.is_compiled_with_cuda() else ["CPU"] + for device in devices: + place = set_device(device) + self.check_output_with_place(place) + + +class TestBeamSearch(ModuleApiTest): + def setUp(self): + paddle.set_default_dtype("float64") + shape = (8, 32) + self.inputs = [ + np.random.random(shape).astype("float64"), + np.random.random(shape).astype("float64") + ] + self.outputs = None + self.attrs = { + "vocab_size": 100, + "embed_dim": 32, + "hidden_size": 32, + } + self.param_states = {} + + @staticmethod + def model_init(self, + vocab_size, + embed_dim, + hidden_size, + bos_id=0, + eos_id=1, + beam_size=2, + max_step_num=2): + embedder = paddle.fluid.dygraph.Embedding( + size=[vocab_size, embed_dim], dtype="float64") + output_layer = nn.Linear(hidden_size, vocab_size) + cell = nn.LSTMCell(embed_dim, hidden_size) + self.max_step_num = max_step_num + self.beam_search_decoder = BeamSearchDecoder( + cell, + start_token=bos_id, + end_token=eos_id, + beam_size=beam_size, + embedding_fn=embedder, + output_fn=output_layer) + + @staticmethod + def model_forward(model, init_hidden, init_cell): + return dynamic_decode( + model.beam_search_decoder, [init_hidden, init_cell], + max_step_num=model.max_step_num, + impute_finished=True, + is_test=True)[0] + + def make_inputs(self): + inputs = [ + Input([None, self.inputs[0].shape[-1]], "float64", "init_hidden"), + Input([None, self.inputs[1].shape[-1]], "float64", "init_cell"), + ] + return inputs + + def test_check_output(self): + self.check_output() + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/nn/__init__.py b/python/paddle/nn/__init__.py index dd435f12e32c1..6af59465be47c 100644 --- a/python/paddle/nn/__init__.py +++ b/python/paddle/nn/__init__.py @@ -42,14 +42,11 @@ # from .control_flow import StaticRNN #DEFINE_ALIAS # from .control_flow import while_loop #DEFINE_ALIAS # from .control_flow import rnn #DEFINE_ALIAS -# from .decode import BeamSearchDecoder #DEFINE_ALIAS +from .decode import BeamSearchDecoder #DEFINE_ALIAS +from .decode import dynamic_decode #DEFINE_ALIAS # from .decode import Decoder #DEFINE_ALIAS -# from .decode import beam_search #DEFINE_ALIAS -# from .decode import beam_search_decode #DEFINE_ALIAS # from .decode import 
crf_decoding #DEFINE_ALIAS # from .decode import ctc_greedy_decoder #DEFINE_ALIAS -# from .decode import dynamic_decode #DEFINE_ALIAS -# from .decode import gather_tree #DEFINE_ALIAS # from .input import Input #DEFINE_ALIAS from .layer.activation import ELU #DEFINE_ALIAS from .layer.activation import GELU #DEFINE_ALIAS diff --git a/python/paddle/nn/decode.py b/python/paddle/nn/decode.py new file mode 100644 index 0000000000000..bba5aba0da9ad --- /dev/null +++ b/python/paddle/nn/decode.py @@ -0,0 +1,21 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..fluid.layers import BeamSearchDecoder #DEFINE_ALIAS +from ..fluid.layers import dynamic_decode #DEFINE_ALIAS + +__all__ = [ + 'BeamSearchDecoder', + 'dynamic_decode', +] diff --git a/python/paddle/nn/functional/__init__.py b/python/paddle/nn/functional/__init__.py index 574721bd2b056..5f9307845ae9d 100644 --- a/python/paddle/nn/functional/__init__.py +++ b/python/paddle/nn/functional/__init__.py @@ -216,3 +216,4 @@ # from .vision import yolov3_loss #DEFINE_ALIAS from .input import one_hot #DEFINE_ALIAS from .input import embedding #DEFINE_ALIAS +from ...fluid.layers import gather_tree diff --git a/python/paddle/tests/test_text.py b/python/paddle/tests/test_text.py deleted file mode 100644 index fa83b0cc6f340..0000000000000 --- a/python/paddle/tests/test_text.py +++ /dev/null @@ -1,696 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import division -from __future__ import print_function - -import unittest -import random - -import numpy as np - -import paddle -import paddle.fluid as fluid -from paddle.fluid.dygraph import Embedding, Linear, Layer -from paddle.fluid.layers import BeamSearchDecoder -from paddle import Model, set_device -from paddle.static import InputSpec as Input -from paddle.text import * - -paddle.enable_static() - - -class ModuleApiTest(unittest.TestCase): - @classmethod - def setUpClass(cls): - cls._np_rand_state = np.random.get_state() - cls._py_rand_state = random.getstate() - cls._random_seed = 123 - np.random.seed(cls._random_seed) - random.seed(cls._random_seed) - - cls.model_cls = type(cls.__name__ + "Model", (Layer, ), { - "__init__": cls.model_init_wrapper(cls.model_init), - "forward": cls.model_forward - }) - - @classmethod - def tearDownClass(cls): - np.random.set_state(cls._np_rand_state) - random.setstate(cls._py_rand_state) - - @staticmethod - def model_init_wrapper(func): - def __impl__(self, *args, **kwargs): - Layer.__init__(self) - func(self, *args, **kwargs) - - return __impl__ - - @staticmethod - def model_init(model, *args, **kwargs): - raise NotImplementedError( - "model_init acts as `Model.__init__`, thus must implement it") - - @staticmethod - def model_forward(model, *args, **kwargs): - return model.module(*args, **kwargs) - - def make_inputs(self): - # TODO(guosheng): add default from `self.inputs` - raise NotImplementedError( - "model_inputs makes inputs for model, thus must implement it") - - def setUp(self): - """ - For the model which wraps the module to be tested: - Set input data by `self.inputs` list - Set init argument values by `self.attrs` list/dict - Set model parameter values by `self.param_states` dict - Set expected output data by `self.outputs` list - We can create a model instance and run once with these. - """ - self.inputs = [] - self.attrs = {} - self.param_states = {} - self.outputs = [] - - def _calc_output(self, place, mode="test", dygraph=True): - if dygraph: - fluid.enable_dygraph(place) - else: - fluid.disable_dygraph() - gen = paddle.manual_seed(self._random_seed) - gen._is_init_py = False - paddle.framework.random._manual_program_seed(self._random_seed) - scope = fluid.core.Scope() - with fluid.scope_guard(scope): - layer = self.model_cls(**self.attrs) if isinstance( - self.attrs, dict) else self.model_cls(*self.attrs) - model = Model(layer, inputs=self.make_inputs()) - model.prepare() - if self.param_states: - model.load(self.param_states, optim_state=None) - return model.test_batch(self.inputs) - - def check_output_with_place(self, place, mode="test"): - dygraph_output = self._calc_output(place, mode, dygraph=True) - stgraph_output = self._calc_output(place, mode, dygraph=False) - expect_output = getattr(self, "outputs", None) - for actual_t, expect_t in zip(dygraph_output, stgraph_output): - self.assertTrue(np.allclose(actual_t, expect_t, rtol=1e-5, atol=0)) - if expect_output: - for actual_t, expect_t in zip(dygraph_output, expect_output): - self.assertTrue( - np.allclose( - actual_t, expect_t, rtol=1e-5, atol=0)) - - def check_output(self): - devices = ["CPU", "GPU"] if fluid.is_compiled_with_cuda() else ["CPU"] - for device in devices: - place = set_device(device) - self.check_output_with_place(place) - - -class TestBasicLSTM(ModuleApiTest): - def setUp(self): - # TODO(guosheng): Change to big size. Currently bigger hidden size for - # LSTM would fail, the second static graph run might get diff output - # with others. 
- shape = (2, 4, 16) - self.inputs = [np.random.random(shape).astype("float32")] - self.outputs = None - self.attrs = {"input_size": 16, "hidden_size": 16} - self.param_states = {} - - @staticmethod - def model_init(model, input_size, hidden_size): - model.lstm = RNN(BasicLSTMCell( - input_size, - hidden_size, )) - - @staticmethod - def model_forward(model, inputs): - return model.lstm(inputs)[0] - - def make_inputs(self): - inputs = [ - Input([None, None, self.inputs[-1].shape[-1]], "float32", "input"), - ] - return inputs - - def test_check_output(self): - self.check_output() - - -class TestBasicGRU(ModuleApiTest): - def setUp(self): - shape = (2, 4, 128) - self.inputs = [np.random.random(shape).astype("float32")] - self.outputs = None - self.attrs = {"input_size": 128, "hidden_size": 128} - self.param_states = {} - - @staticmethod - def model_init(model, input_size, hidden_size): - model.gru = RNN(BasicGRUCell(input_size, hidden_size)) - - @staticmethod - def model_forward(model, inputs): - return model.gru(inputs)[0] - - def make_inputs(self): - inputs = [ - Input([None, None, self.inputs[-1].shape[-1]], "float32", "input"), - ] - return inputs - - def test_check_output(self): - self.check_output() - - -class TestBeamSearch(ModuleApiTest): - def setUp(self): - shape = (8, 32) - self.inputs = [ - np.random.random(shape).astype("float32"), - np.random.random(shape).astype("float32") - ] - self.outputs = None - self.attrs = { - "vocab_size": 100, - "embed_dim": 32, - "hidden_size": 32, - } - self.param_states = {} - - @staticmethod - def model_init(self, - vocab_size, - embed_dim, - hidden_size, - bos_id=0, - eos_id=1, - beam_size=4, - max_step_num=20): - embedder = Embedding(size=[vocab_size, embed_dim]) - output_layer = Linear(hidden_size, vocab_size) - cell = BasicLSTMCell(embed_dim, hidden_size) - decoder = BeamSearchDecoder( - cell, - start_token=bos_id, - end_token=eos_id, - beam_size=beam_size, - embedding_fn=embedder, - output_fn=output_layer) - self.beam_search_decoder = DynamicDecode( - decoder, max_step_num=max_step_num, is_test=True) - - @staticmethod - def model_forward(model, init_hidden, init_cell): - return model.beam_search_decoder([init_hidden, init_cell])[0] - - def make_inputs(self): - inputs = [ - Input([None, self.inputs[0].shape[-1]], "float32", "init_hidden"), - Input([None, self.inputs[1].shape[-1]], "float32", "init_cell"), - ] - return inputs - - def test_check_output(self): - self.check_output() - - -class TestTransformerEncoder(ModuleApiTest): - def setUp(self): - self.inputs = [ - # encoder input: [batch_size, seq_len, hidden_size] - np.random.random([2, 4, 512]).astype("float32"), - # self attention bias: [batch_size, n_head, seq_len, seq_len] - np.random.randint(0, 1, [2, 8, 4, 4]).astype("float32") * -1e9 - ] - self.outputs = None - self.attrs = { - "n_layer": 2, - "n_head": 8, - "d_key": 64, - "d_value": 64, - "d_model": 512, - "d_inner_hid": 1024 - } - self.param_states = {} - - @staticmethod - def model_init(model, - n_layer, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout=0.1, - attention_dropout=0.1, - relu_dropout=0.1, - preprocess_cmd="n", - postprocess_cmd="da", - ffn_fc1_act="relu"): - model.encoder = TransformerEncoder( - n_layer, n_head, d_key, d_value, d_model, d_inner_hid, - prepostprocess_dropout, attention_dropout, relu_dropout, - preprocess_cmd, postprocess_cmd, ffn_fc1_act) - - @staticmethod - def model_forward(model, enc_input, attn_bias): - return model.encoder(enc_input, attn_bias) - - def 
make_inputs(self): - inputs = [ - Input([None, None, self.inputs[0].shape[-1]], "float32", - "enc_input"), - Input([None, self.inputs[1].shape[1], None, None], "float32", - "attn_bias"), - ] - return inputs - - def test_check_output(self): - self.check_output() - - -class TestTransformerDecoder(TestTransformerEncoder): - def setUp(self): - self.inputs = [ - # decoder input: [batch_size, seq_len, hidden_size] - np.random.random([2, 4, 512]).astype("float32"), - # encoder output: [batch_size, seq_len, hidden_size] - np.random.random([2, 5, 512]).astype("float32"), - # self attention bias: [batch_size, n_head, seq_len, seq_len] - np.random.randint(0, 1, [2, 8, 4, 4]).astype("float32") * -1e9, - # cross attention bias: [batch_size, n_head, seq_len, seq_len] - np.random.randint(0, 1, [2, 8, 4, 5]).astype("float32") * -1e9 - ] - self.outputs = None - self.attrs = { - "n_layer": 2, - "n_head": 8, - "d_key": 64, - "d_value": 64, - "d_model": 512, - "d_inner_hid": 1024 - } - self.param_states = {} - - @staticmethod - def model_init(model, - n_layer, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout=0.1, - attention_dropout=0.1, - relu_dropout=0.1, - preprocess_cmd="n", - postprocess_cmd="da"): - model.decoder = TransformerDecoder( - n_layer, n_head, d_key, d_value, d_model, d_inner_hid, - prepostprocess_dropout, attention_dropout, relu_dropout, - preprocess_cmd, postprocess_cmd) - - @staticmethod - def model_forward(model, - dec_input, - enc_output, - self_attn_bias, - cross_attn_bias, - caches=None): - return model.decoder(dec_input, enc_output, self_attn_bias, - cross_attn_bias, caches) - - def make_inputs(self): - inputs = [ - Input([None, None, self.inputs[0].shape[-1]], "float32", - "dec_input"), - Input([None, None, self.inputs[0].shape[-1]], "float32", - "enc_output"), - Input([None, self.inputs[-1].shape[1], None, None], "float32", - "self_attn_bias"), - Input([None, self.inputs[-1].shape[1], None, None], "float32", - "cross_attn_bias"), - ] - return inputs - - def test_check_output(self): - self.check_output() - - -class TestTransformerBeamSearchDecoder(ModuleApiTest): - def setUp(self): - self.inputs = [ - # encoder output: [batch_size, seq_len, hidden_size] - np.random.random([2, 5, 128]).astype("float32"), - # cross attention bias: [batch_size, n_head, seq_len, seq_len] - np.random.randint(0, 1, [2, 2, 1, 5]).astype("float32") * -1e9 - ] - self.outputs = None - self.attrs = { - "vocab_size": 100, - "n_layer": 2, - "n_head": 2, - "d_key": 64, - "d_value": 64, - "d_model": 128, - "d_inner_hid": 128 - } - self.param_states = {} - - @staticmethod - def model_init(model, - vocab_size, - n_layer, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout=0.1, - attention_dropout=0.1, - relu_dropout=0.1, - preprocess_cmd="n", - postprocess_cmd="da", - bos_id=0, - eos_id=1, - beam_size=4, - max_step_num=20): - model.beam_size = beam_size - - def embeder_init(self, size): - Layer.__init__(self) - self.embedder = Embedding(size) - - Embedder = type("Embedder", (Layer, ), { - "__init__": embeder_init, - "forward": lambda self, word, pos: self.embedder(word) - }) - embedder = Embedder(size=[vocab_size, d_model]) - output_layer = Linear(d_model, vocab_size) - model.decoder = TransformerDecoder( - n_layer, n_head, d_key, d_value, d_model, d_inner_hid, - prepostprocess_dropout, attention_dropout, relu_dropout, - preprocess_cmd, postprocess_cmd) - transformer_cell = TransformerCell(model.decoder, embedder, - output_layer) - model.beam_search_decoder 
= DynamicDecode( - TransformerBeamSearchDecoder( - transformer_cell, bos_id, eos_id, beam_size, - var_dim_in_state=2), - max_step_num, - is_test=True) - - @staticmethod - def model_forward(model, enc_output, trg_src_attn_bias): - caches = model.decoder.prepare_incremental_cache(enc_output) - enc_output = TransformerBeamSearchDecoder.tile_beam_merge_with_batch( - enc_output, model.beam_size) - trg_src_attn_bias = TransformerBeamSearchDecoder.tile_beam_merge_with_batch( - trg_src_attn_bias, model.beam_size) - static_caches = model.decoder.prepare_static_cache(enc_output) - rs, _ = model.beam_search_decoder( - inits=caches, - enc_output=enc_output, - trg_src_attn_bias=trg_src_attn_bias, - static_caches=static_caches) - return rs - - def make_inputs(self): - inputs = [ - Input([None, None, self.inputs[0].shape[-1]], "float32", - "enc_output"), - Input([None, self.inputs[1].shape[1], None, None], "float32", - "trg_src_attn_bias"), - ] - return inputs - - def test_check_output(self): - self.check_output() - - -class TestSequenceTagging(ModuleApiTest): - def setUp(self): - self.inputs = [ - np.random.randint(0, 100, (2, 8)).astype("int64"), - np.random.randint(1, 8, (2)).astype("int64"), - np.random.randint(0, 5, (2, 8)).astype("int64") - ] - self.outputs = None - self.attrs = {"vocab_size": 100, "num_labels": 5} - self.param_states = {} - - @staticmethod - def model_init(model, - vocab_size, - num_labels, - word_emb_dim=128, - grnn_hidden_dim=128, - emb_learning_rate=0.1, - crf_learning_rate=0.1, - bigru_num=2, - init_bound=0.1): - model.tagger = SequenceTagging(vocab_size, num_labels, word_emb_dim, - grnn_hidden_dim, emb_learning_rate, - crf_learning_rate, bigru_num, init_bound) - - @staticmethod - def model_forward(model, word, lengths, target=None): - return model.tagger(word, lengths, target) - - def make_inputs(self): - inputs = [ - Input([None, None], "int64", "word"), - Input([None], "int64", "lengths"), - Input([None, None], "int64", "target"), - ] - return inputs - - def test_check_output(self): - self.check_output() - - -class TestSequenceTaggingInfer(TestSequenceTagging): - def setUp(self): - super(TestSequenceTaggingInfer, self).setUp() - self.inputs = self.inputs[:2] # remove target - - def make_inputs(self): - inputs = super(TestSequenceTaggingInfer, - self).make_inputs()[:2] # remove target - return inputs - - -class TestStackedRNN(ModuleApiTest): - def setUp(self): - shape = (2, 4, 16) - self.inputs = [np.random.random(shape).astype("float32")] - self.outputs = None - self.attrs = {"input_size": 16, "hidden_size": 16, "num_layers": 2} - self.param_states = {} - - @staticmethod - def model_init(model, input_size, hidden_size, num_layers): - cells = [ - BasicLSTMCell(input_size, hidden_size), - BasicLSTMCell(hidden_size, hidden_size) - ] - stacked_cell = StackedRNNCell(cells) - model.lstm = RNN(stacked_cell) - - @staticmethod - def model_forward(self, inputs): - return self.lstm(inputs)[0] - - def make_inputs(self): - inputs = [ - Input([None, None, self.inputs[-1].shape[-1]], "float32", "input"), - ] - return inputs - - def test_check_output(self): - self.check_output() - - -class TestLSTM(ModuleApiTest): - def setUp(self): - shape = (2, 4, 16) - self.inputs = [np.random.random(shape).astype("float32")] - self.outputs = None - self.attrs = {"input_size": 16, "hidden_size": 16, "num_layers": 2} - self.param_states = {} - - @staticmethod - def model_init(model, input_size, hidden_size, num_layers): - model.lstm = LSTM(input_size, hidden_size, num_layers=num_layers) - - @staticmethod 
- def model_forward(model, inputs): - return model.lstm(inputs)[0] - - def make_inputs(self): - inputs = [ - Input([None, None, self.inputs[-1].shape[-1]], "float32", "input"), - ] - return inputs - - def test_check_output(self): - self.check_output() - - -class TestBiLSTM(ModuleApiTest): - def setUp(self): - shape = (2, 4, 16) - self.inputs = [np.random.random(shape).astype("float32")] - self.outputs = None - self.attrs = {"input_size": 16, "hidden_size": 16, "num_layers": 2} - self.param_states = {} - - @staticmethod - def model_init(model, - input_size, - hidden_size, - num_layers, - merge_mode="concat", - merge_each_layer=False): - model.bilstm = BidirectionalLSTM( - input_size, - hidden_size, - num_layers=num_layers, - merge_mode=merge_mode, - merge_each_layer=merge_each_layer) - - @staticmethod - def model_forward(model, inputs): - return model.bilstm(inputs)[0] - - def make_inputs(self): - inputs = [ - Input([None, None, self.inputs[-1].shape[-1]], "float32", "input"), - ] - return inputs - - def test_check_output_merge0(self): - self.check_output() - - def test_check_output_merge1(self): - self.attrs["merge_each_layer"] = True - self.check_output() - - -class TestGRU(ModuleApiTest): - def setUp(self): - shape = (2, 4, 64) - self.inputs = [np.random.random(shape).astype("float32")] - self.outputs = None - self.attrs = {"input_size": 64, "hidden_size": 128, "num_layers": 2} - self.param_states = {} - - @staticmethod - def model_init(model, input_size, hidden_size, num_layers): - model.gru = GRU(input_size, hidden_size, num_layers=num_layers) - - @staticmethod - def model_forward(model, inputs): - return model.gru(inputs)[0] - - def make_inputs(self): - inputs = [ - Input([None, None, self.inputs[-1].shape[-1]], "float32", "input"), - ] - return inputs - - def test_check_output(self): - self.check_output() - - -class TestBiGRU(ModuleApiTest): - def setUp(self): - shape = (2, 4, 64) - self.inputs = [np.random.random(shape).astype("float32")] - self.outputs = None - self.attrs = {"input_size": 64, "hidden_size": 128, "num_layers": 2} - self.param_states = {} - - @staticmethod - def model_init(model, - input_size, - hidden_size, - num_layers, - merge_mode="concat", - merge_each_layer=False): - model.bigru = BidirectionalGRU( - input_size, - hidden_size, - num_layers=num_layers, - merge_mode=merge_mode, - merge_each_layer=merge_each_layer) - - @staticmethod - def model_forward(model, inputs): - return model.bigru(inputs)[0] - - def make_inputs(self): - inputs = [ - Input([None, None, self.inputs[-1].shape[-1]], "float32", "input"), - ] - return inputs - - def test_check_output_merge0(self): - self.check_output() - - def test_check_output_merge1(self): - self.attrs["merge_each_layer"] = True - self.check_output() - - -class TestCNNEncoder(ModuleApiTest): - def setUp(self): - shape = (2, 32, 8) # [N, C, H] - self.inputs = [np.random.random(shape).astype("float32")] - self.outputs = None - self.attrs = {"num_channels": 32, "num_filters": 64, "num_layers": 2} - self.param_states = {} - - @staticmethod - def model_init(model, num_channels, num_filters, num_layers): - model.cnn_encoder = CNNEncoder( - num_layers=2, - num_channels=num_channels, - num_filters=num_filters, - filter_size=[2, 3], - pool_size=[7, 6]) - - @staticmethod - def model_forward(model, inputs): - return model.cnn_encoder(inputs) - - def make_inputs(self): - inputs = [ - Input([None, self.inputs[-1].shape[1], None], "float32", "input"), - ] - return inputs - - def test_check_output(self): - self.check_output() - - -if 
__name__ == '__main__': - unittest.main() diff --git a/python/paddle/text/__init__.py b/python/paddle/text/__init__.py index 083bfbd1d2528..b6f8ea6bcc7e4 100644 --- a/python/paddle/text/__init__.py +++ b/python/paddle/text/__init__.py @@ -12,11 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from . import text -from .text import * - from . import datasets from .datasets import * -__all__ = text.__all__ \ - + datasets.__all__ +__all__ = datasets.__all__ diff --git a/python/paddle/text/text.py b/python/paddle/text/text.py deleted file mode 100644 index a0fa4791c5b1c..0000000000000 --- a/python/paddle/text/text.py +++ /dev/null @@ -1,3965 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import copy -import collections -import six -import sys -from functools import partial, reduce - -import numpy as np - -import paddle -import paddle.fluid as fluid -import paddle.fluid.layers.utils as utils -from paddle.fluid import layers -from paddle.fluid.layers import BeamSearchDecoder -from paddle.fluid.layers.utils import map_structure, flatten, pack_sequence_as -from paddle.fluid.dygraph import Layer, Embedding, Linear, LayerNorm, GRUUnit, Conv2D, Pool2D -from paddle.fluid.data_feeder import convert_dtype - -__all__ = [ - 'RNNCell', - 'BasicLSTMCell', - 'BasicGRUCell', - 'RNN', - 'BidirectionalRNN', - 'StackedRNNCell', - 'StackedLSTMCell', - 'LSTM', - 'BidirectionalLSTM', - 'StackedGRUCell', - 'GRU', - 'BidirectionalGRU', - 'DynamicDecode', - 'BeamSearchDecoder', - 'Conv1dPoolLayer', - 'CNNEncoder', - 'MultiHeadAttention', - 'FFN', - 'TransformerEncoderLayer', - 'TransformerEncoder', - 'TransformerDecoderLayer', - 'TransformerDecoder', - 'TransformerCell', - 'TransformerBeamSearchDecoder', - 'LinearChainCRF', - 'CRFDecoding', - 'SequenceTagging', -] - - -class RNNCell(Layer): - """ - RNNCell is the base class for abstraction representing the calculations - mapping the input and state to the output and new state. It is suitable to - and mostly used in RNN. - """ - - def get_initial_states(self, - batch_ref, - shape=None, - dtype=None, - init_value=0, - batch_dim_idx=0): - """ - Generate initialized states according to provided shape, data type and - value. - - Parameters: - batch_ref: A (possibly nested structure of) tensor variable[s]. - The first dimension of the tensor will be used as batch size to - initialize states. - shape: A (possibly nested structure of) shape[s], where a shape is - represented as a list/tuple of integer). -1(for batch size) will - beautomatically inserted if shape is not started with it. If None, - property `state_shape` will be used. The default value is None. - dtype: A (possibly nested structure of) data type[s]. 
The structure - must be same as that of `shape`, except when all tensors' in states - has the same data type, a single data type can be used. If None and - property `cell.state_shape` is not available, float32 will be used - as the data type. The default value is None. - init_value: A float value used to initialize states. - batch_dim_idx: An integer indicating which dimension of the tensor in - inputs represents batch size. The default value is 0. - - Returns: - Variable: tensor variable[s] packed in the same structure provided \ - by shape, representing the initialized states. - """ - # TODO: use inputs and batch_size - batch_ref = flatten(batch_ref)[0] - - def _is_shape_sequence(seq): - if sys.version_info < (3, ): - integer_types = ( - int, - long, ) - else: - integer_types = (int, ) - """For shape, list/tuple of integer is the finest-grained objection""" - if (isinstance(seq, list) or isinstance(seq, tuple)): - if reduce(lambda flag, x: isinstance(x, integer_types) and flag, - seq, True): - return False - # TODO: Add check for the illegal - if isinstance(seq, dict): - return True - return (isinstance(seq, collections.Sequence) and - not isinstance(seq, six.string_types)) - - class Shape(object): - def __init__(self, shape): - self.shape = shape if shape[0] == -1 else ([-1] + list(shape)) - - # nested structure of shapes - states_shapes = self.state_shape if shape is None else shape - is_sequence_ori = utils.is_sequence - utils.is_sequence = _is_shape_sequence - states_shapes = map_structure(lambda shape: Shape(shape), states_shapes) - utils.is_sequence = is_sequence_ori - - # nested structure of dtypes - try: - states_dtypes = self.state_dtype if dtype is None else dtype - except NotImplementedError: # use fp32 as default - states_dtypes = "float32" - if len(flatten(states_dtypes)) == 1: - dtype = flatten(states_dtypes)[0] - states_dtypes = map_structure(lambda shape: dtype, states_shapes) - - init_states = map_structure( - lambda shape, dtype: fluid.layers.fill_constant_batch_size_like( - input=batch_ref, - shape=shape.shape, - dtype=dtype, - value=init_value, - input_dim_idx=batch_dim_idx), states_shapes, states_dtypes) - return init_states - - @property - def state_shape(self): - """ - Abstract method (property). - Used to initialize states. - A (possiblely nested structure of) shape[s], where a shape is represented - as a list/tuple of integers (-1 for batch size would be automatically - inserted into a shape if shape is not started with it). - Not necessary to be implemented if states are not initialized by - `get_initial_states` or the `shape` argument is provided when using - `get_initial_states`. - """ - raise NotImplementedError( - "Please add implementaion for `state_shape` in the used cell.") - - @property - def state_dtype(self): - """ - Abstract method (property). - Used to initialize states. - A (possiblely nested structure of) data types[s]. The structure must be - same as that of `shape`, except when all tensors' in states has the same - data type, a signle data type can be used. - Not necessary to be implemented if states are not initialized - by `get_initial_states` or the `dtype` argument is provided when using - `get_initial_states`. - """ - raise NotImplementedError( - "Please add implementaion for `state_dtype` in the used cell.") - - -class BasicLSTMCell(RNNCell): - """ - Long-Short Term Memory(LSTM) RNN cell. - - The formula used is as follows: - - .. 
math:: - - i_{t} & = act_g(W_{x_{i}}x_{t} + W_{h_{i}}h_{t-1} + b_{i}) - - f_{t} & = act_g(W_{x_{f}}x_{t} + W_{h_{f}}h_{t-1} + b_{f} + forget\\_bias) - - c_{t} & = f_{t}c_{t-1} + i_{t} act_c (W_{x_{c}}x_{t} + W_{h_{c}}h_{t-1} + b_{c}) - - o_{t} & = act_g(W_{x_{o}}x_{t} + W_{h_{o}}h_{t-1} + b_{o}) - - h_{t} & = o_{t} act_c (c_{t}) - - Please refer to `An Empirical Exploration of Recurrent Network Architectures - `_ for more details. - - Parameters: - input_size (int): The input size in the LSTM cell. - hidden_size (int): The hidden size in the LSTM cell. - param_attr(ParamAttr, optional): The parameter attribute for the learnable - weight matrix. Default: None. - bias_attr (ParamAttr, optional): The parameter attribute for the bias - of LSTM. Default: None. - gate_activation (function, optional): The activation function for gates - of LSTM, that is :math:`act_g` in the formula. Default: None, - representing for `fluid.layers.sigmoid`. - activation (function, optional): The non-gate activation function of - LSTM, that is :math:`act_c` in the formula. Default: None, - representing for 'fluid.layers.tanh'. - forget_bias(float, optional): forget bias used when computing forget gate. - Default 1.0 - dtype(string, optional): The data type used in this cell. Default float32. - - Examples: - - .. code-block:: python - - import paddle - import paddle.fluid as fluid - from paddle.text import BasicLSTMCell, RNN - - inputs = paddle.rand((2, 4, 32)) - cell = BasicLSTMCell(input_size=32, hidden_size=64) - rnn = RNN(cell=cell) - outputs, _ = rnn(inputs) # [2, 4, 64] - """ - - def __init__(self, - input_size, - hidden_size, - param_attr=None, - bias_attr=None, - gate_activation=None, - activation=None, - forget_bias=1.0, - dtype='float32'): - super(BasicLSTMCell, self).__init__() - - self._hidden_size = hidden_size - self._param_attr = param_attr - self._bias_attr = bias_attr - self._gate_activation = gate_activation or layers.sigmoid - self._activation = activation or layers.tanh - # TODO(guosheng): find better way to resolve constants in __init__ - self._forget_bias = layers.create_global_var( - shape=[1], dtype=dtype, value=forget_bias, persistable=True) - # TODO(guosheng): refine this if recurrent_op removes gradient require - self._forget_bias.stop_gradient = False - self._dtype = dtype - self._input_size = input_size - - self._weight = self.create_parameter( - attr=self._param_attr, - shape=[ - self._input_size + self._hidden_size, 4 * self._hidden_size - ], - dtype=self._dtype) - - self._bias = self.create_parameter( - attr=self._bias_attr, - shape=[4 * self._hidden_size], - dtype=self._dtype, - is_bias=True) - - def forward(self, inputs, states): - """ - Performs single step LSTM calculations. - - Parameters: - inputs (Variable): A tensor with shape `[batch_size, input_size]`, - corresponding to :math:`x_t` in the formula. The data type - should be float32 or float64. - states (Variable): A list of containing two tensors, each shaped - `[batch_size, hidden_size]`, corresponding to :math:`h_{t-1}, c_{t-1}` - in the formula. The data type should be float32 or float64. - - Returns: - tuple: A tuple( :code:`(outputs, new_states)` ), where `outputs` is \ - a tensor with shape `[batch_size, hidden_size]`, corresponding \ - to :math:`h_{t}` in the formula; `new_states` is a list containing \ - two tenser variables shaped `[batch_size, hidden_size]`, corresponding \ - to :math:`h_{t}, c_{t}` in the formula. The data type of these \ - tensors all is same as that of `states`. 
- """ - pre_hidden, pre_cell = states - concat_input_hidden = layers.concat([inputs, pre_hidden], 1) - gate_input = layers.matmul(x=concat_input_hidden, y=self._weight) - gate_input = layers.elementwise_add(gate_input, self._bias) - i, j, f, o = layers.split(gate_input, num_or_sections=4, dim=-1) - new_cell = layers.elementwise_add( - layers.elementwise_mul( - pre_cell, - self._gate_activation( - layers.elementwise_add(f, self._forget_bias))), - layers.elementwise_mul( - self._gate_activation(i), self._activation(j))) - new_hidden = self._activation(new_cell) * self._gate_activation(o) - - return new_hidden, [new_hidden, new_cell] - - @property - def state_shape(self): - """ - The `state_shape` of BasicLSTMCell is a list with two shapes: `[[hidden_size], [hidden_size]]` - (-1 for batch size would be automatically inserted into shape). These two - shapes correspond to :math:`h_{t-1}` and :math:`c_{t-1}` separately. - """ - return [[self._hidden_size], [self._hidden_size]] - - -class BasicGRUCell(RNNCell): - """ - Gated Recurrent Unit (GRU) RNN cell. - - The formula for GRU used is as follows: - - .. math:: - - u_t & = act_g(W_{ux}x_{t} + W_{uh}h_{t-1} + b_u) - - r_t & = act_g(W_{rx}x_{t} + W_{rh}h_{t-1} + b_r) - - \\tilde{h_t} & = act_c(W_{cx}x_{t} + W_{ch}(r_t \odot h_{t-1}) + b_c) - - h_t & = u_t \odot h_{t-1} + (1-u_t) \odot \\tilde{h_t} - - Please refer to `An Empirical Exploration of Recurrent Network Architectures - `_ for more details. - - Parameters: - input_size (int): The input size for the first GRU cell. - hidden_size (int): The hidden size for every GRU cell. - param_attr(ParamAttr, optional): The parameter attribute for the learnable - weight matrix. Default: None. - bias_attr (ParamAttr, optional): The parameter attribute for the bias - of LSTM. Default: None. - gate_activation (function, optional): The activation function for gates - of GRU, that is :math:`act_g` in the formula. Default: None, - representing for `fluid.layers.sigmoid`. - activation (function, optional): The non-gate activation function of - GRU, that is :math:`act_c` in the formula. Default: None, - representing for 'fluid.layers.tanh'. - dtype(string, optional): The data type used in this cell. Default float32. - - Examples: - - .. 
code-block:: python - - import paddle - import paddle.fluid as fluid - from paddle.text import BasicGRUCell, RNN - - inputs = paddle.rand((2, 4, 32)) - cell = BasicGRUCell(input_size=32, hidden_size=64) - rnn = RNN(cell=cell) - outputs, _ = rnn(inputs) # [2, 4, 64] - """ - - def __init__(self, - input_size, - hidden_size, - param_attr=None, - bias_attr=None, - gate_activation=None, - activation=None, - dtype='float32'): - super(BasicGRUCell, self).__init__() - self._input_size = input_size - self._hidden_size = hidden_size - self._param_attr = param_attr - self._bias_attr = bias_attr - self._gate_activation = gate_activation or layers.sigmoid - self._activation = activation or layers.tanh - self._dtype = dtype - - if self._param_attr is not None and self._param_attr.name is not None: - gate_param_attr = copy.deepcopy(self._param_attr) - candidate_param_attr = copy.deepcopy(self._param_attr) - gate_param_attr.name += "_gate" - candidate_param_attr.name += "_candidate" - else: - gate_param_attr = self._param_attr - candidate_param_attr = self._param_attr - - self._gate_weight = self.create_parameter( - attr=gate_param_attr, - shape=[ - self._input_size + self._hidden_size, 2 * self._hidden_size - ], - dtype=self._dtype) - - self._candidate_weight = self.create_parameter( - attr=candidate_param_attr, - shape=[self._input_size + self._hidden_size, self._hidden_size], - dtype=self._dtype) - - if self._bias_attr is not None and self._bias_attr.name is not None: - gate_bias_attr = copy.deepcopy(self._bias_attr) - candidate_bias_attr = copy.deepcopy(self._bias_attr) - gate_bias_attr.name += "_gate" - candidate_bias_attr.name += "_candidate" - else: - gate_bias_attr = self._bias_attr - candidate_bias_attr = self._bias_attr - - self._gate_bias = self.create_parameter( - attr=gate_bias_attr, - shape=[2 * self._hidden_size], - dtype=self._dtype, - is_bias=True) - self._candidate_bias = self.create_parameter( - attr=candidate_bias_attr, - shape=[self._hidden_size], - dtype=self._dtype, - is_bias=True) - - def forward(self, inputs, states): - """ - Performs single step GRU calculations. - - Parameters: - inputs (Variable): A tensor with shape `[batch_size, input_size]`, - corresponding to :math:`x_t` in the formula. The data type - should be float32 or float64. - states (Variable): A tensor with shape `[batch_size, hidden_size]`. - corresponding to :math:`h_{t-1}` in the formula. The data type - should be float32 or float64. - - Returns: - tuple: A tuple( :code:`(outputs, new_states)` ), where `outputs` and \ - `new_states` is the same tensor shaped `[batch_size, hidden_size]`, \ - corresponding to :math:`h_t` in the formula. The data type of the \ - tensor is same as that of `states`. - """ - pre_hidden = states - concat_input_hidden = layers.concat([inputs, pre_hidden], axis=1) - - gate_input = layers.matmul(x=concat_input_hidden, y=self._gate_weight) - - gate_input = layers.elementwise_add(gate_input, self._gate_bias) - - gate_input = self._gate_activation(gate_input) - r, u = layers.split(gate_input, num_or_sections=2, dim=1) - - r_hidden = r * pre_hidden - - candidate = layers.matmul( - layers.concat([inputs, r_hidden], 1), self._candidate_weight) - candidate = layers.elementwise_add(candidate, self._candidate_bias) - - c = self._activation(candidate) - new_hidden = u * pre_hidden + (1 - u) * c - - return new_hidden, new_hidden - - @property - def state_shape(self): - """ - The `state_shape` of BasicGRUCell is a shape `[hidden_size]` (-1 for batch - size would be automatically inserted into shape). 
The shape corresponds - to :math:`h_{t-1}`. - """ - return [self._hidden_size] - - -class RNN(Layer): - """ - RNN creates a recurrent neural network specified by RNNCell `cell`, which - performs :code:`cell.forward()` repeatedly until reaches to the maximum - length of `inputs`. - - Parameters: - cell(RNNCell): An instance of `RNNCell`. - is_reverse (bool, optional): Indicate whether to calculate in the reverse - order of input sequences. Default: `False`. - time_major (bool, optional): Indicate the data layout of Tensor included - in `input` and `output` tensors. If `False`, the data layout would - be batch major with shape `[batch_size, sequence_length, ...]`. If - `True`, the data layout would be time major with shape - `[sequence_length, batch_size, ...]`. Default: `False`. - - Examples: - - .. code-block:: python - - import paddle - import paddle.fluid as fluid - from paddle.text import StackedLSTMCell, RNN - - inputs = paddle.rand((2, 4, 32)) - cell = StackedLSTMCell(input_size=32, hidden_size=64) - rnn = RNN(cell=cell) - outputs, _ = rnn(inputs) # [2, 4, 64] - """ - - def __init__(self, cell, is_reverse=False, time_major=False): - super(RNN, self).__init__() - self.cell = cell - if not hasattr(self.cell, "call"): - self.cell.call = self.cell.forward - self.is_reverse = is_reverse - self.time_major = time_major - self.batch_index, self.time_step_index = (1, 0) if time_major else (0, - 1) - - def forward(self, - inputs, - initial_states=None, - sequence_length=None, - **kwargs): - """ - Performs :code:`cell.forward()` repeatedly until reaches to the maximum - length of `inputs`. - - Parameters: - inputs (Variable): A (possibly nested structure of) tensor variable[s]. - The shape of tensor should be `[batch_size, sequence_length, ...]` - for `time_major == False` or `[sequence_length, batch_size, ...]` - for `time_major == True`. It represents the inputs to be unrolled - in RNN. - initial_states (Variable, optional): A (possibly nested structure of) - tensor variable[s], representing the initial state for RNN. - If not provided, `cell.get_initial_states` would be used to produce - the initial state. Default None. - sequence_length (Variable, optional): A tensor with shape `[batch_size]`. - It stores real length of each instance, thus enables users to extract - the last valid state when past a batch element's sequence length for - correctness. If not provided, the paddings would be treated same as - non-padding inputs. Default None. - **kwargs: Additional keyword arguments. Arguments passed to `cell.forward`. - - Returns: - tuple: A tuple( :code:`(final_outputs, final_states)` ) including the final \ - outputs and states, both are Tensor or nested structure of Tensor. \ - `final_outputs` has the same structure and data types as \ - the returned `outputs` of :code:`cell.forward` , and each Tenser in `final_outputs` \ - stacks all time steps' counterpart in `outputs` thus has shape `[batch_size, sequence_length, ...]` \ - for `time_major == False` or `[sequence_length, batch_size, ...]` for `time_major == True`. \ - `final_states` is the counterpart at last time step of initial states, \ - thus has the same structure with it and has tensors with same shapes \ - and data types. 
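The contract described above amounts to stepping the wrapped cell once per time step and stacking the per-step outputs. A minimal batch-major sketch follows; the ``unroll`` helper and the toy cell are illustrative only, and ``sequence_length`` masking and ``is_reverse`` are omitted.

.. code-block:: python

    import numpy as np

    def unroll(cell, inputs, initial_states):
        # step the cell once per time step and stack outputs along time
        states = initial_states
        outputs = []
        for t in range(inputs.shape[1]):
            step_out, states = cell(inputs[:, t], states)
            outputs.append(step_out)
        return np.stack(outputs, axis=1), states

    # toy cell with the (inputs, states) -> (outputs, new_states) interface
    cell = lambda x, h: (np.tanh(x + h),) * 2
    x = np.random.rand(2, 4, 8).astype("float32")
    outputs, final_states = unroll(cell, x, np.zeros((2, 8), "float32"))  # [2, 4, 8]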
- """ - if fluid.in_dygraph_mode(): - - class ArrayWrapper(object): - def __init__(self, x): - self.array = [x] - - def append(self, x): - self.array.append(x) - return self - - def _maybe_copy(state, new_state, step_mask): - # TODO: use where_op - new_state = fluid.layers.elementwise_mul( - new_state, step_mask, - axis=0) - fluid.layers.elementwise_mul( - state, (step_mask - 1), axis=0) - return new_state - - flat_inputs = flatten(inputs) - batch_size, time_steps = ( - flat_inputs[0].shape[self.batch_index], - flat_inputs[0].shape[self.time_step_index]) - - if initial_states is None: - initial_states = self.cell.get_initial_states( - batch_ref=inputs, batch_dim_idx=self.batch_index) - - if not self.time_major: - inputs = map_structure( - lambda x: fluid.layers.transpose(x, [1, 0] + list( - range(2, len(x.shape)))), inputs) - - if sequence_length is not None: - mask = fluid.layers.sequence_mask( - sequence_length, - maxlen=time_steps, - dtype=flatten(initial_states)[0].dtype) - mask = fluid.layers.transpose(mask, [1, 0]) - - if self.is_reverse: - inputs = map_structure( - lambda x: fluid.layers.reverse(x, axis=[0]), inputs) - mask = fluid.layers.reverse( - mask, axis=[0]) if sequence_length is not None else None - - states = initial_states - outputs = [] - for i in range(time_steps): - step_inputs = map_structure(lambda x: x[i], inputs) - step_outputs, new_states = self.cell(step_inputs, states, - **kwargs) - if sequence_length is not None: - new_states = map_structure( - partial( - _maybe_copy, step_mask=mask[i]), - states, - new_states) - states = new_states - outputs = map_structure( - lambda x: ArrayWrapper(x), - step_outputs) if i == 0 else map_structure( - lambda x, x_array: x_array.append(x), step_outputs, - outputs) - - final_outputs = map_structure( - lambda x: fluid.layers.stack(x.array, axis=self.time_step_index - ), outputs) - - if self.is_reverse: - final_outputs = map_structure( - lambda x: fluid.layers.reverse(x, axis=self.time_step_index - ), final_outputs) - - final_states = new_states - else: - final_outputs, final_states = fluid.layers.rnn( - self.cell, - inputs, - initial_states=initial_states, - sequence_length=sequence_length, - time_major=self.time_major, - is_reverse=self.is_reverse, - **kwargs) - return final_outputs, final_states - - -class StackedRNNCell(RNNCell): - """ - Wrapper allowing a stack of RNN cells to behave as a single cell. It is used - to implement stacked RNNs. - - Parameters: - cells (list|tuple): List of RNN cell instances. - - Examples: - - .. code-block:: python - - from paddle.text import BasicLSTMCell, StackedRNNCell - - cells = [BasicLSTMCell(32, 32), BasicLSTMCell(32, 32)] - stack_rnn = StackedRNNCell(cells) - """ - - def __init__(self, cells): - super(StackedRNNCell, self).__init__() - self.cells = [] - for i, cell in enumerate(cells): - self.cells.append(self.add_sublayer("cell_%d" % i, cell)) - - def forward(self, inputs, states, **kwargs): - """ - Performs :code:`cell.forward` for all including cells sequentially. - Each cell's `inputs` is the `outputs` of the previous cell. And each - cell's `states` is the corresponding one in `states`. - - Parameters: - inputs (Variable): The inputs for the first cell. Mostly it is a - float32 or float64 tensor with shape `[batch_size, input_size]`. - states (list): A list containing states for all cells orderly. - **kwargs: Additional keyword arguments, which passed to `cell.forward` - for all including cells. - - Returns: - tuple: A tuple( :code:`(outputs, new_states)` ). 
`outputs` is the \ - `outputs` of the last cell. `new_states` is a list composed \ - of all cells' `new_states`, and its structure and data type is \ - same as that of `states` argument. - """ - new_states = [] - for cell, state in zip(self.cells, states): - outputs, new_state = cell(inputs, state, **kwargs) - inputs = outputs - new_states.append(new_state) - return outputs, new_states - - @staticmethod - def stack_param_attr(param_attr, n): - """ - If `param_attr` is a list or tuple, convert every element in it to a - ParamAttr instance. Otherwise, repeat `param_attr` `n` times to - construct a list, and rename every one by appending a increasing index - suffix to avoid having same names when `param_attr` contains a name. - - Parameters: - param_attr (list|tuple|ParamAttr): A list, tuple or something can be - converted to a ParamAttr instance by `ParamAttr._to_attr`. - n (int): The times to repeat to construct a list when `param_attr` - is not a list or tuple. - - Returns: - list: A list composed of each including cell's `param_attr`. - """ - if isinstance(param_attr, (list, tuple)): - assert len(param_attr) == n, ( - "length of param_attr should be %d when it is a list/tuple" % n) - param_attrs = [ - fluid.ParamAttr._to_attr(attr) for attr in param_attr - ] - else: - param_attrs = [] - attr = fluid.ParamAttr._to_attr(param_attr) - for i in range(n): - attr_i = copy.deepcopy(attr) - if attr.name: - attr_i.name = attr_i.name + "_" + str(i) - param_attrs.append(attr_i) - return param_attrs - - @property - def state_shape(self): - """ - The `state_shape` of StackedRNNCell is a list composed of each including - cell's `state_shape`. - - Returns: - list: A list composed of each including cell's `state_shape`. - """ - return [cell.state_shape for cell in self.cells] - - -class StackedLSTMCell(RNNCell): - """ - Wrapper allowing a stack of LSTM cells to behave as a single cell. It is used - to implement stacked LSTM. - - The formula for LSTM used here is as follows: - - .. math:: - - i_{t} & = act_g(W_{x_{i}}x_{t} + W_{h_{i}}h_{t-1} + b_{i}) - - f_{t} & = act_g(W_{x_{f}}x_{t} + W_{h_{f}}h_{t-1} + b_{f} + forget\\_bias) - - c_{t} & = f_{t}c_{t-1} + i_{t} act_c (W_{x_{c}}x_{t} + W_{h_{c}}h_{t-1} + b_{c}) - - o_{t} & = act_g(W_{x_{o}}x_{t} + W_{h_{o}}h_{t-1} + b_{o}) - - h_{t} & = o_{t} act_c (c_{t}) - - - Parameters: - input_size (int): The input size for the first LSTM cell. - hidden_size (int): The hidden size for every LSTM cell. - gate_activation (function, optional): The activation function for gates - of LSTM, that is :math:`act_g` in the formula. Default: None, - representing for `fluid.layers.sigmoid`. - activation (function, optional): The non-gate activation function of - LSTM, that is :math:`act_c` in the formula. Default: None, - representing for 'fluid.layers.tanh'. - forget_bias (float, optional): forget bias used when computing forget - gate. It also can accept a boolean value `True`, which would set - :math:`forget\\_bias` as 0 but initialize :math:`b_{f}` as 1 and - :math:`b_{i}, b_{f}, b_{c}, b_{0}` as 0. This is recommended in - http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf . - Default 1.0. - num_layers(int, optional): The number of LSTM to be stacked. Default 1. - dropout(float|list|tuple, optional): The dropout probability after each - LSTM. It also can be a list or tuple, including dropout probabilities - for the corresponding LSTM. 
Default 0.0 - param_attr (list|tuple|ParamAttr): A list, tuple or something can be - converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is - a list or tuple, it's length must equal to `num_layers`. Otherwise, - construct a list by `StackedRNNCell.stack_param_attr(param_attr, num_layers)`. - Default None. - bias_attr (list|tuple|ParamAttr): A list, tuple or something can be - converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is - a list or tuple, it's length must equal to `num_layers`. Otherwise, - construct a list by `StackedRNNCell.stack_param_attr(bias_attr, num_layers)`. - Default None. - dtype(string, optional): The data type used in this cell. It can be - float32 or float64. Default float32. - - Examples: - - .. code-block:: python - - import paddle - import paddle.fluid as fluid - from paddle.text import StackedLSTMCell, RNN - - inputs = paddle.rand((2, 4, 32)) - cell = StackedLSTMCell(input_size=32, hidden_size=64) - rnn = RNN(cell=cell) - outputs, _ = rnn(inputs) # [2, 4, 64] - """ - - def __init__(self, - input_size, - hidden_size, - gate_activation=None, - activation=None, - forget_bias=1.0, - num_layers=1, - dropout=0.0, - param_attr=None, - bias_attr=None, - dtype="float32"): - super(StackedLSTMCell, self).__init__() - self.dropout = utils.convert_to_list(dropout, num_layers, "dropout", - float) - param_attrs = StackedRNNCell.stack_param_attr(param_attr, num_layers) - bias_attrs = StackedRNNCell.stack_param_attr(bias_attr, num_layers) - - self.cells = [] - for i in range(num_layers): - if forget_bias is True: - bias_attrs[ - i].initializer = fluid.initializer.NumpyArrayInitializer( - np.concatenate( - np.zeros(2 * hidden_size), - np.ones(hidden_size), np.zeros(hidden_size)).astype( - dtype)) - forget_bias = 0.0 - self.cells.append( - self.add_sublayer( - "lstm_%d" % i, - BasicLSTMCell( - input_size=input_size if i == 0 else hidden_size, - hidden_size=hidden_size, - gate_activation=gate_activation, - activation=activation, - forget_bias=forget_bias, - param_attr=param_attrs[i], - bias_attr=bias_attrs[i], - dtype=dtype))) - - def forward(self, inputs, states): - """ - Performs the stacked LSTM cells sequentially. Each cell's `inputs` is - the `outputs` of the previous cell. And each cell's `states` is the - corresponding one in `states`. - - Parameters: - inputs (Variable): The inputs for the first cell. It is a float32 or - float64 tensor with shape `[batch_size, input_size]`. - states (list): A list containing states for all cells orderly. - **kwargs: Additional keyword arguments, which passed to `cell.forward` - for all including cells. - - Returns: - tuple: A tuple( :code:`(outputs, new_states)` ), where `outputs` is \ - a tensor with shape `[batch_size, hidden_size]`, corresponding \ - to :math:`h_{t}` in the formula of the last LSTM; `new_states` \ - is a list composed of every LSTM `new_states` which is a pair \ - of tensors standing for :math:`h_{t}, c_{t}` in the formula, \ - and the data type and structure of these tensors all is same \ - as that of `states`. - """ - new_states = [] - for i, cell in enumerate(self.cells): - outputs, new_state = cell(inputs, states[i]) - outputs = layers.dropout( - outputs, - self.dropout[i], - dropout_implementation='upscale_in_train') if self.dropout[ - i] > 0 else outputs - inputs = outputs - new_states.append(new_state) - return outputs, new_states - - @property - def state_shape(self): - """ - The `state_shape` of StackedLSTMCell is a list composed of each including - LSTM cell's `state_shape`. 
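For reference, the LSTM equations listed above can be evaluated directly in numpy. The sketch below assumes a single fused gate weight split into ``i, candidate, f, o`` (a common layout; the actual parameter layout of ``BasicLSTMCell`` may differ), the default sigmoid/tanh activations, and random placeholder tensors.

.. code-block:: python

    import numpy as np

    def sigmoid(x):
        return 1.0 / (1.0 + np.exp(-x))

    batch, input_size, hidden_size, forget_bias = 2, 32, 64, 1.0
    x = np.random.rand(batch, input_size).astype("float32")
    h_prev = np.zeros((batch, hidden_size), "float32")
    c_prev = np.zeros((batch, hidden_size), "float32")
    w = np.random.rand(input_size + hidden_size, 4 * hidden_size).astype("float32")
    b = np.zeros(4 * hidden_size, "float32")

    gates = np.concatenate([x, h_prev], 1).dot(w) + b
    i, j, f, o = np.split(gates, 4, axis=1)          # j is the cell candidate
    c = sigmoid(f + forget_bias) * c_prev + sigmoid(i) * np.tanh(j)
    h = sigmoid(o) * np.tanh(c)                      # h_t and c_t in the formula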
- - Returns: - list: A list composed of each including LSTM cell's `state_shape`. - """ - return [cell.state_shape for cell in self.cells] - - -class LSTM(Layer): - """ - Applies a stacked multi-layer long short-term memory (LSTM) RNN to an input - sequence. - - The formula for LSTM used here is as follows: - - .. math:: - - i_{t} & = act_g(W_{x_{i}}x_{t} + W_{h_{i}}h_{t-1} + b_{i}) - - f_{t} & = act_g(W_{x_{f}}x_{t} + W_{h_{f}}h_{t-1} + b_{f} + forget\\_bias) - - c_{t} & = f_{t}c_{t-1} + i_{t} act_c (W_{x_{c}}x_{t} + W_{h_{c}}h_{t-1} + b_{c}) - - o_{t} & = act_g(W_{x_{o}}x_{t} + W_{h_{o}}h_{t-1} + b_{o}) - - h_{t} & = o_{t} act_c (c_{t}) - - - Parameters: - input_size (int): The input feature size for the first LSTM. - hidden_size (int): The hidden size for every LSTM. - gate_activation (function, optional): The activation function for gates - of LSTM, that is :math:`act_g` in the formula. Default: None, - representing for `fluid.layers.sigmoid`. - activation (function, optional): The non-gate activation function of - LSTM, that is :math:`act_c` in the formula. Default: None, - representing for 'fluid.layers.tanh'. - forget_bias (float, optional): forget bias used when computing forget - gate. It also can accept a boolean value `True`, which would set - :math:`forget\\_bias` as 0 but initialize :math:`b_{f}` as 1 and - :math:`b_{i}, b_{f}, b_{c}, b_{0}` as 0. This is recommended in - http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf . - Default 1.0. - num_layers(int, optional): The number of LSTM to be stacked. Default 1. - dropout(float|list|tuple, optional): The dropout probability after each - LSTM. It also can be a list or tuple, including dropout probabilities - for the corresponding LSTM. Default 0.0 - is_reverse (bool, optional): Indicate whether to calculate in the reverse - order of input sequences. Default: `False`. - time_major (bool, optional): Indicate the data layout of Tensor included - in `input` and `output` tensors. If `False`, the data layout would - be batch major with shape `[batch_size, sequence_length, ...]`. If - `True`, the data layout would be time major with shape - `[sequence_length, batch_size, ...]`. Default: `False`. - param_attr (list|tuple|ParamAttr): A list, tuple or something can be - converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is - a list or tuple, it's length must equal to `num_layers`. Otherwise, - construct a list by `StackedRNNCell.stack_param_attr(param_attr, num_layers)`. - Default None. - bias_attr (list|tuple|ParamAttr): A list, tuple or something can be - converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is - a list or tuple, it's length must equal to `num_layers`. Otherwise, - construct a list by `StackedRNNCell.stack_param_attr(bias_attr, num_layers)`. - Default None. - dtype(string, optional): The data type used in this cell. It can be - float32 or float64. Default float32. - - Examples: - - .. 
code-block:: python - - import paddle - import paddle.fluid as fluid - from paddle.text import LSTM - - inputs = paddle.rand((2, 4, 32)) - lstm = LSTM(input_size=32, hidden_size=64, num_layers=2) - outputs, _ = lstm(inputs) # [2, 4, 64] - """ - - def __init__(self, - input_size, - hidden_size, - gate_activation=None, - activation=None, - forget_bias=1.0, - num_layers=1, - dropout=0.0, - is_reverse=False, - time_major=False, - param_attr=None, - bias_attr=None, - dtype='float32'): - super(LSTM, self).__init__() - lstm_cell = StackedLSTMCell(input_size, hidden_size, gate_activation, - activation, forget_bias, num_layers, - dropout, param_attr, bias_attr, dtype) - self.lstm = RNN(lstm_cell, is_reverse, time_major) - - def forward(self, inputs, initial_states=None, sequence_length=None): - """ - Performs the stacked multi-layer LSTM layer by layer. Each LSTM's `outputs` - is the `inputs` of the subsequent one. - - Parameters: - inputs (Variable): The inputs for the first LSTM. It is a float32 - or float64 tensor shaped `[batch_size, sequence_length, input_size]`. - initial_states (list|None, optional): A list containing initial states - of all stacked LSTM, and the initial states of each LSTM is a pair - of tensors shaped `[batch_size, hidden_size]`. If not provided, - use 0 as initial states. Default None. - sequence_length (Variable, optional): A tensor with shape `[batch_size]`. - It stores real length of each instance, thus enables users to extract - the last valid state when past a batch element's sequence length for - correctness. If not provided, the paddings would be treated same as - non-padding inputs. Default None. - - Returns: - tuple: A tuple( :code:`(outputs, final_states)` ), where `outputs` \ - is the output of last LSTM and it is a tensor with shape \ - `[batch_size, sequence_length, hidden_size]` and has the same \ - data type as `inputs`, `final_states` is the counterpart of \ - `initial_states` at last time step, thus has the same structure \ - with it and has tensors with same shapes data types. - """ - return self.lstm(inputs, initial_states, sequence_length) - - -class BidirectionalRNN(Layer): - """ - Wrapper for bidirectional RNN. It assembles two RNNCell instances to perform - forward and backward RNN separately, and merge outputs of these two RNN - according to `merge_mode`. - - Parameters: - cell_fw (RNNCell): A RNNCell instance used for forward RNN. - cell_bw (RNNCell): A RNNCell instance used for backward RNN. - merge_mode (str|None, optional): The way to merget outputs of forward and - backward RNN. It can be `concat`, `sum`, `ave`, `mul`, `zip` and None, - where None stands for make the two `outputs` as a tuple, `zip` stands - for make each two corresponding tensors of the two `outputs` as a tuple. - Default `concat` - - Examples: - - .. 
code-block:: python - - import paddle - from paddle.text import StackedLSTMCell, BidirectionalRNN - - inputs = paddle.rand((2, 4, 32)) - cell_fw = StackedLSTMCell(32, 64) - cell_bw = StackedLSTMCell(32, 64) - bi_rnn = BidirectionalRNN(cell_fw, cell_bw) - outputs, _ = bi_rnn(inputs) # [2, 4, 128] - """ - - def __init__(self, - cell_fw, - cell_bw, - merge_mode='concat', - time_major=False, - cell_cls=None, - **kwargs): - super(BidirectionalRNN, self).__init__() - self.rnn_fw = RNN(cell_fw, is_reverse=False, time_major=time_major) - self.rnn_bw = RNN(cell_bw, is_reverse=True, time_major=time_major) - if merge_mode == 'concat': - self.merge_func = lambda x, y: layers.concat([x, y], -1) - elif merge_mode == 'sum': - self.merge_func = lambda x, y: layers.elementwise_add(x, y) - elif merge_mode == 'ave': - self.merge_func = lambda x, y: layers.scale( - layers.elementwise_add(x, y), 0.5) - elif merge_mode == 'mul': - self.merge_func = lambda x, y: layers.elementwise_mul(x, y) - elif merge_mode == 'zip': - self.merge_func = lambda x, y: (x, y) - elif merge_mode is None: - self.merge_func = None - else: - raise ValueError('Unsupported value for `merge_mode`: %s' % - merge_mode) - - def forward(self, - inputs, - initial_states=None, - sequence_length=None, - **kwargs): - """ - Performs forward and backward RNN separately, and merge outputs of these - two RNN according to `merge_mode`. - - Parameters: - inputs (Variable): A (possibly nested structure of) tensor variable[s]. - The shape of tensor should be `[batch_size, sequence_length, ...]` - for `time_major == False` or `[sequence_length, batch_size, ...]` - for `time_major == True`. It represents the inputs to be unrolled - in both forward and backward RNN. - initial_states (Variable|list|tuple): If it is a list or tuple, its - length should be 2 to include initial states of forward and backward - RNN separately. Otherwise it would be used twice for the two RNN. - If None, `cell.get_initial_states` would be used to produce the initial - states. Default None. - sequence_length (Variable, optional): A tensor with shape `[batch_size]`. - It stores real length of each instance, thus enables users to extract - the last valid state when past a batch element's sequence length for - correctness. If not provided, the paddings would be treated same as - non-padding inputs. Default None. - **kwargs: Additional keyword arguments. Arguments passed to `cell.forward`. - - Returns: - tuple: A tuple( :code:`(outputs, final_states)` ), where `outputs` \ - is produced by merge outputs of forward and backward RNN according \ - to `merge_mode`, `final_states` is a pair including `final_states` \ - of forward and backward RNN. - """ - if isinstance(initial_states, (list, tuple)): - assert len( - initial_states - ) == 2, "length of initial_states should be 2 when it is a list/tuple" - else: - initial_states = [initial_states, initial_states] - outputs_fw, states_fw = self.rnn_fw(inputs, initial_states[0], - sequence_length, **kwargs) - outputs_bw, states_bw = self.rnn_bw(inputs, initial_states[1], - sequence_length, **kwargs) - outputs = map_structure(self.merge_func, outputs_fw, - outputs_bw) if self.merge_func else (outputs_fw, - outputs_bw) - return outputs, (states_fw, states_bw) - - @staticmethod - def bidirect_param_attr(param_attr): - """ - Converts `param_attr` to a pair of `param_attr` when it is not a list - or tuple with length 2, also rename every one by appending a suffix to - avoid having same names when `param_attr` contains a name. 
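The ``merge_mode`` options handled in ``__init__`` above reduce to simple combinations of the two directions' outputs. A numpy illustration with placeholder outputs (the backward outputs are assumed to have been flipped back to input order, which ``RNN(is_reverse=True)`` already does):

.. code-block:: python

    import numpy as np

    out_fw = np.random.rand(2, 4, 64).astype("float32")  # forward-direction outputs
    out_bw = np.random.rand(2, 4, 64).astype("float32")  # backward-direction outputs

    merged = {
        "concat": np.concatenate([out_fw, out_bw], -1),  # [2, 4, 128]
        "sum": out_fw + out_bw,
        "ave": (out_fw + out_bw) * 0.5,
        "mul": out_fw * out_bw,
        "zip": (out_fw, out_bw),  # keep the two outputs as a tuple
    }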
- - Parameters: - param_attr (list|tuple|ParamAttr): A list, tuple or something can be - converted to a ParamAttr instance by `ParamAttr._to_attr`. When - it is a list or tuple, its length must be 2. - - Returns: - list: A pair composed of forward and backward RNN cell's `param_attr`. - """ - if isinstance(param_attr, (list, tuple)): - assert len( - param_attr - ) == 2, "length of param_attr should be 2 when it is a list/tuple" - param_attrs = param_attr - else: - param_attrs = [] - attr = fluid.ParamAttr._to_attr(param_attr) - attr_fw = copy.deepcopy(attr) - if attr.name: - attr_fw.name = attr_fw.name + "_fw" - param_attrs.append(attr_fw) - attr_bw = copy.deepcopy(attr) - if attr.name: - attr_bw.name = attr_bw.name + "_bw" - param_attrs.append(attr_bw) - return param_attrs - - -class BidirectionalLSTM(Layer): - """ - Applies a bidirectional multi-layer long short-term memory (LSTM) RNN to an - input sequence. - - Bidirection interaction can happen after each layer or only after the last - layer according to the `merge_each_layer` setting. The way to interact, - that is how to merge outputs of the two direction, is determined by `merge_mode`. - - The formula for LSTM used here is as follows: - - .. math:: - - i_{t} & = act_g(W_{x_{i}}x_{t} + W_{h_{i}}h_{t-1} + b_{i}) - - f_{t} & = act_g(W_{x_{f}}x_{t} + W_{h_{f}}h_{t-1} + b_{f} + forget\\_bias) - - c_{t} & = f_{t}c_{t-1} + i_{t} act_c (W_{x_{c}}x_{t} + W_{h_{c}}h_{t-1} + b_{c}) - - o_{t} & = act_g(W_{x_{o}}x_{t} + W_{h_{o}}h_{t-1} + b_{o}) - - h_{t} & = o_{t} act_c (c_{t}) - - - Parameters: - input_size (int): The input feature size for the first LSTM. - hidden_size (int): The hidden size for every LSTM. - gate_activation (function, optional): The activation function for gates - of LSTM, that is :math:`act_g` in the formula. Default: None, - representing for `fluid.layers.sigmoid`. - activation (function, optional): The non-gate activation function of - LSTM, that is :math:`act_c` in the formula. Default: None, - representing for 'fluid.layers.tanh'. - forget_bias (float, optional): forget bias used when computing forget - gate. It also can accept a boolean value `True`, which would set - :math:`forget\\_bias` as 0 but initialize :math:`b_{f}` as 1 and - :math:`b_{i}, b_{f}, b_{c}, b_{0}` as 0. This is recommended in - http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf . - Default 1.0. - num_layers(int, optional): The number of LSTM to be stacked. Default 1. - dropout(float|list|tuple, optional): The dropout probability after each - LSTM. It also can be a list or tuple, including dropout probabilities - for the corresponding LSTM. Default 0.0 - merge_mode (str|None, optional): The way to merget outputs of forward and - backward RNN. It can be `concat`, `sum`, `ave`, `mul`, `zip` and None, - where None stands for make the two `outputs` as a tuple, `zip` stands - for make each two corresponding tensors of the two `outputs` as a tuple. - Default `concat` - merge_each_layer (bool, optional): Indicate whether bidirection interaction - happens after each layer or only after the last layer. Default: `False`. - time_major (bool, optional): Indicate the data layout of Tensor included - in `input` and `output` tensors. If `False`, the data layout would - be batch major with shape `[batch_size, sequence_length, ...]`. If - `True`, the data layout would be time major with shape - `[sequence_length, batch_size, ...]`. Default: `False`. 
- param_attr (list|tuple|ParamAttr): A list, tuple or something can be - converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is - a list or tuple, it's length must equal to `num_layers`. Otherwise, - construct a list by `StackedRNNCell.stack_param_attr(param_attr, num_layers)`. - Default None. - bias_attr (list|tuple|ParamAttr): A list, tuple or something can be - converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is - a list or tuple, it's length must equal to `num_layers`. Otherwise, - construct a list by `StackedRNNCell.stack_param_attr(bias_attr, num_layers)`. - Default None. - dtype(string, optional): The data type used in this cell. It can be - float32 or float64. Default float32. - - Examples: - - .. code-block:: python - - import paddle - import paddle.fluid as fluid - from paddle.text import BidirectionalLSTM - - inputs = paddle.rand((2, 4, 32)) - bi_lstm = BidirectionalLSTM(input_size=32, hidden_size=64, num_layers=2) - outputs, _ = bi_lstm(inputs) # [2, 4, 128] - """ - - def __init__(self, - input_size, - hidden_size, - gate_activation=None, - activation=None, - forget_bias=1.0, - num_layers=1, - dropout=0.0, - merge_mode='concat', - merge_each_layer=False, - time_major=False, - param_attr=None, - bias_attr=None, - dtype='float32'): - super(BidirectionalLSTM, self).__init__() - self.num_layers = num_layers - self.merge_mode = merge_mode - self.merge_each_layer = merge_each_layer - param_attrs = BidirectionalRNN.bidirect_param_attr(param_attr) - bias_attrs = BidirectionalRNN.bidirect_param_attr(bias_attr) - if not merge_each_layer: - cell_fw = StackedLSTMCell(input_size, hidden_size, gate_activation, - activation, forget_bias, num_layers, - dropout, param_attrs[0], bias_attrs[0], - dtype) - cell_bw = StackedLSTMCell(input_size, hidden_size, gate_activation, - activation, forget_bias, num_layers, - dropout, param_attrs[1], bias_attrs[1], - dtype) - self.lstm = BidirectionalRNN( - cell_fw, cell_bw, merge_mode=merge_mode, time_major=time_major) - else: - fw_param_attrs = StackedRNNCell.stack_param_attr(param_attrs[0], - num_layers) - bw_param_attrs = StackedRNNCell.stack_param_attr(param_attrs[1], - num_layers) - fw_bias_attrs = StackedRNNCell.stack_param_attr(bias_attrs[0], - num_layers) - bw_bias_attrs = StackedRNNCell.stack_param_attr(bias_attrs[1], - num_layers) - - # maybe design cell including both forward and backward later - self.lstm = [] - for i in range(num_layers): - cell_fw = StackedLSTMCell( - input_size - if i == 0 else (hidden_size * 2 - if merge_mode == 'concat' else hidden_size), - hidden_size, gate_activation, activation, forget_bias, 1, - dropout, fw_param_attrs[i], fw_bias_attrs[i], dtype) - cell_bw = StackedLSTMCell( - input_size - if i == 0 else (hidden_size * 2 - if merge_mode == 'concat' else hidden_size), - hidden_size, gate_activation, activation, forget_bias, 1, - dropout, bw_param_attrs[i], bw_bias_attrs[i], dtype) - self.lstm.append( - self.add_sublayer( - "lstm_%d" % i, - BidirectionalRNN( - cell_fw, - cell_bw, - merge_mode=merge_mode, - time_major=time_major))) - - def forward(self, inputs, initial_states=None, sequence_length=None): - """ - Performs bidirectional multi-layer LSTM layer by layer. Each LSTM's `outputs` - is the `inputs` of the subsequent one, or when `merge_each_layer` is True, - merged outputs would be the `inputs` of the subsequent one. - - Parameters: - inputs (Variable): The inputs for the first LSTM. It is a float32 - or float64 tensor shaped `[batch_size, sequence_length, input_size]`. 
- initial_states (list|None, optional): A list containing initial states - of all stacked LSTM. If `merge_each_layer` is True, the length of - list should be `num_layers` and a single value would be reused for - `num_layers`; Otherwise, the length should be 2 and a single value - would be reused twice. If not provided, use 0 as initial states. - Default None. - sequence_length (Variable, optional): A tensor with shape `[batch_size]`. - It stores real length of each instance, thus enables users to extract - the last valid state when past a batch element's sequence length for - correctness. If not provided, the paddings would be treated same as - non-padding inputs. Default None. - - Returns: - tuple: A tuple( :code:`(outputs, final_states)` ), where `outputs` \ - is the output of last bidirectional LSTM; `final_states` is a \ - pair including `final_states` of forward and backward LSTM when \ - `merge_each_layer` is False or a list including `final_states` \ - of all stacked bidirectional LSTM, and it has tensors with same \ - shapes data types as `initial_states`. - """ - if not self.merge_each_layer: - return self.lstm(inputs, initial_states, sequence_length) - else: - if isinstance(initial_states, (list, tuple)): - assert len(initial_states) == self.num_layers, ( - "length of initial_states should be %d when it is a list/tuple" - % self.num_layers) - else: - initial_states = [initial_states] * self.num_layers - stacked_states = [] - for i in range(self.num_layers): - outputs, states = self.lstm[i](inputs, initial_states[i], - sequence_length) - inputs = outputs - stacked_states.append(states) - return outputs, stacked_states - - -class StackedGRUCell(RNNCell): - """ - Wrapper allowing a stack of GRU cells to behave as a single cell. It is used - to implement stacked GRU. - - The formula for GRU used here is as follows: - - .. math:: - - u_t & = act_g(W_{ux}x_{t} + W_{uh}h_{t-1} + b_u) - - r_t & = act_g(W_{rx}x_{t} + W_{rh}h_{t-1} + b_r) - - \\tilde{h_t} & = act_c(W_{cx}x_{t} + W_{ch}(r_t \odot h_{t-1}) + b_c) - - h_t & = u_t \odot h_{t-1} + (1-u_t) \odot \\tilde{h_t} - - - Parameters: - input_size (int): The input size for the first GRU cell. - hidden_size (int): The hidden size for every GRU cell. - gate_activation (function, optional): The activation function for gates - of GRU, that is :math:`act_g` in the formula. Default: None, - representing for `fluid.layers.sigmoid`. - activation (function, optional): The non-gate activation function of - GRU, that is :math:`act_c` in the formula. Default: None, - representing for 'fluid.layers.tanh'. - num_layers(int, optional): The number of LSTM to be stacked. Default 1. - dropout(float|list|tuple, optional): The dropout probability after each - GRU. It also can be a list or tuple, including dropout probabilities - for the corresponding GRU. Default 0.0 - param_attr (list|tuple|ParamAttr): A list, tuple or something can be - converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is - a list or tuple, it's length must equal to `num_layers`. Otherwise, - construct a list by `StackedRNNCell.stack_param_attr(param_attr, num_layers)`. - Default None. - bias_attr (list|tuple|ParamAttr): A list, tuple or something can be - converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is - a list or tuple, it's length must equal to `num_layers`. Otherwise, - construct a list by `StackedRNNCell.stack_param_attr(bias_attr, num_layers)`. - Default None. - dtype(string, optional): The data type used in this cell. 
It can be - float32 or float64. Default float32. - - Examples: - - .. code-block:: python - - import paddle - import paddle.fluid as fluid - from paddle.text import StackedGRUCell, RNN - - inputs = paddle.rand((2, 4, 32)) - cell = StackedGRUCell(input_size=32, hidden_size=64) - rnn = RNN(cell=cell) - outputs, _ = rnn(inputs) # [2, 4, 64] - """ - - def __init__(self, - input_size, - hidden_size, - gate_activation=None, - activation=None, - num_layers=1, - dropout=0.0, - param_attr=None, - bias_attr=None, - dtype="float32"): - super(StackedGRUCell, self).__init__() - self.dropout = utils.convert_to_list(dropout, num_layers, "dropout", - float) - param_attrs = StackedRNNCell.stack_param_attr(param_attr, num_layers) - bias_attrs = StackedRNNCell.stack_param_attr(bias_attr, num_layers) - - self.cells = [] - for i in range(num_layers): - self.cells.append( - self.add_sublayer( - "gru_%d" % i, - BasicGRUCell( - input_size=input_size if i == 0 else hidden_size, - hidden_size=hidden_size, - gate_activation=gate_activation, - activation=activation, - param_attr=param_attrs[i], - bias_attr=bias_attrs[i], - dtype=dtype))) - - def forward(self, inputs, states): - """ - Performs the stacked GRU cells sequentially. Each cell's `inputs` is - the `outputs` of the previous cell. And each cell's `states` is the - corresponding one in `states`. - - Parameters: - inputs (Variable): The inputs for the first cell. It is a float32 or - float64 tensor with shape `[batch_size, input_size]`. - states (list): A list containing states for all cells orderly. - **kwargs: Additional keyword arguments, which passed to `cell.forward` - for all including cells. - - Returns: - tuple: A tuple( :code:`(outputs, new_states)` ), where `outputs` is \ - a tensor with shape `[batch_size, hidden_size]`, corresponding \ - to :math:`h_{t}` in the formula of the last GRU; `new_states` \ - is a list composed of every GRU `new_states` which is also \ - :math:`h_{t}` in the formula, and the data type and structure \ - of these tensors all is same as that of `states`. - """ - new_states = [] - for i, cell in enumerate(self.cells): - outputs, new_state = cell(inputs, states[i]) - outputs = layers.dropout( - outputs, - self.dropout[i], - dropout_implementation='upscale_in_train') if self.dropout[ - i] > 0 else outputs - inputs = outputs - new_states.append(new_state) - return outputs, new_states - - @property - def state_shape(self): - """ - The `state_shape` of StackedGRUCell is a list composed of each including - GRU cell's `state_shape`. - - Returns: - list: A list composed of each including GRU cell's `state_shape`. - """ - return [cell.state_shape for cell in self.cells] - - -class GRU(Layer): - """ - Applies a stacked multi-layer gated recurrent unit (GRU) RNN to an input - sequence. - - The formula for GRU used here is as follows: - - .. math:: - - u_t & = act_g(W_{ux}x_{t} + W_{uh}h_{t-1} + b_u) - - r_t & = act_g(W_{rx}x_{t} + W_{rh}h_{t-1} + b_r) - - \\tilde{h_t} & = act_c(W_{cx}x_{t} + W_{ch}(r_t \odot h_{t-1}) + b_c) - - h_t & = u_t \odot h_{t-1} + (1-u_t) \odot \\tilde{h_t} - - - Parameters: - input_size (int): The input feature size for the first GRU cell. - hidden_size (int): The hidden size for every GRU cell. - gate_activation (function, optional): The activation function for gates - of GRU, that is :math:`act_g` in the formula. Default: None, - representing for `fluid.layers.sigmoid`. - activation (function, optional): The non-gate activation function of - GRU, that is :math:`act_c` in the formula. 
Default: None, - representing for 'fluid.layers.tanh'. - num_layers(int, optional): The number of GRU to be stacked. Default 1. - dropout(float|list|tuple, optional): The dropout probability after each - GRU. It also can be a list or tuple, including dropout probabilities - for the corresponding GRU. Default 0.0 - is_reverse (bool, optional): Indicate whether to calculate in the reverse - order of input sequences. Default: `False`. - time_major (bool, optional): Indicate the data layout of Tensor included - in `input` and `output` tensors. If `False`, the data layout would - be batch major with shape `[batch_size, sequence_length, ...]`. If - `True`, the data layout would be time major with shape - `[sequence_length, batch_size, ...]`. Default: `False`. - param_attr (list|tuple|ParamAttr): A list, tuple or something can be - converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is - a list or tuple, it's length must equal to `num_layers`. Otherwise, - construct a list by `StackedRNNCell.stack_param_attr(param_attr, num_layers)`. - Default None. - bias_attr (list|tuple|ParamAttr): A list, tuple or something can be - converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is - a list or tuple, it's length must equal to `num_layers`. Otherwise, - construct a list by `StackedRNNCell.stack_param_attr(bias_attr, num_layers)`. - Default None. - dtype(string, optional): The data type used in this cell. It can be - float32 or float64. Default float32. - - Examples: - - .. code-block:: python - - import paddle - import paddle.fluid as fluid - from paddle.text import GRU - - inputs = paddle.rand((2, 4, 32)) - gru = GRU(input_size=32, hidden_size=64, num_layers=2) - outputs, _ = gru(inputs) # [2, 4, 64] - """ - - def __init__(self, - input_size, - hidden_size, - gate_activation=None, - activation=None, - num_layers=1, - dropout=0.0, - is_reverse=False, - time_major=False, - param_attr=None, - bias_attr=None, - dtype='float32'): - super(GRU, self).__init__() - gru_cell = StackedGRUCell(input_size, hidden_size, gate_activation, - activation, num_layers, dropout, param_attr, - bias_attr, dtype) - self.gru = RNN(gru_cell, is_reverse, time_major) - - def forward(self, inputs, initial_states=None, sequence_length=None): - """ - Performs the stacked multi-layer GRU layer by layer. Each GRU's `outputs` - is the `inputs` of the subsequent one. - - Parameters: - inputs (Variable): The inputs for the first GRU. It is a float32 - or float64 tensor shaped `[batch_size, sequence_length, input_size]`. - initial_states (list|None, optional): A list containing initial states - of all stacked GRU, and the initial states of each GRU is a tensor - shaped `[batch_size, hidden_size]`. If not provided, use 0 as initial - states. Default None. - sequence_length (Variable, optional): A tensor with shape `[batch_size]`. - It stores real length of each instance, thus enables users to extract - the last valid state when past a batch element's sequence length for - correctness. If not provided, the paddings would be treated same as - non-padding inputs. Default None. - - Returns: - tuple: A tuple( :code:`(outputs, final_states)` ), where `outputs` \ - is the output of last GRU and it is a tensor with shape \ - `[batch_size, sequence_length, hidden_size]` and has the same \ - data type as `inputs`, `final_states` is the counterpart of \ - `initial_states` at last time step, thus has the same structure \ - with it and has tensors with same shapes data types. 
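The per-time-step behaviour of the stacked cell wrapped by ``GRU`` (see ``StackedGRUCell.forward`` above) is essentially the loop below; the ``stacked_step`` helper, the toy cells, and the numpy 'upscale_in_train'-style dropout are illustrative only.

.. code-block:: python

    import numpy as np

    def dropout(x, p):
        # 'upscale_in_train': scale the kept activations by 1 / (1 - p)
        mask = (np.random.rand(*x.shape) >= p).astype(x.dtype)
        return x * mask / (1.0 - p)

    def stacked_step(cells, inputs, states, dropouts):
        # each cell's (optionally dropped-out) output is the next cell's input
        new_states = []
        for cell, state, p in zip(cells, states, dropouts):
            inputs, new_state = cell(inputs, state)
            if p > 0:
                inputs = dropout(inputs, p)
            new_states.append(new_state)
        return inputs, new_states

    cells = [lambda x, h: (np.tanh(x + h),) * 2] * 2     # toy cells
    out, new_states = stacked_step(cells, np.zeros((2, 8), "float32"),
                                   [np.zeros((2, 8), "float32")] * 2, [0.0, 0.0])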
- """ - return self.gru(inputs, initial_states, sequence_length) - - -class BidirectionalGRU(Layer): - """ - Applies a bidirectional multi-layer gated recurrent unit (GRU) RNN to an input - sequence. - - Bidirection interaction can happen after each layer or only after the last - layer according to the `merge_each_layer` setting. The way to interact, - that is how to merge outputs of the two direction, is determined by `merge_mode`. - - The formula for GRU used here is as follows: - - .. math:: - - u_t & = act_g(W_{ux}x_{t} + W_{uh}h_{t-1} + b_u) - - r_t & = act_g(W_{rx}x_{t} + W_{rh}h_{t-1} + b_r) - - \\tilde{h_t} & = act_c(W_{cx}x_{t} + W_{ch}(r_t \odot h_{t-1}) + b_c) - - h_t & = u_t \odot h_{t-1} + (1-u_t) \odot \\tilde{h_t} - - - Parameters: - input_size (int): The input feature size for the first GRU cell. - hidden_size (int): The hidden size for every GRU cell. - gate_activation (function, optional): The activation function for gates - of GRU, that is :math:`act_g` in the formula. Default: None, - representing for `fluid.layers.sigmoid`. - activation (function, optional): The non-gate activation function of - GRU, that is :math:`act_c` in the formula. Default: None, - representing for 'fluid.layers.tanh'. - num_layers(int, optional): The number of GRU to be stacked. Default 1. - dropout(float|list|tuple, optional): The dropout probability after each - GRU. It also can be a list or tuple, including dropout probabilities - for the corresponding GRU. Default 0.0 - merge_mode (str|None, optional): The way to merget outputs of forward and - backward RNN. It can be `concat`, `sum`, `ave`, `mul`, `zip` and None, - where None stands for make the two `outputs` as a tuple, `zip` stands - for make each two corresponding tensors of the two `outputs` as a tuple. - Default `concat` - merge_each_layer (bool, optional): Indicate whether bidirection interaction - happens after each layer or only after the last layer. Default: `False`. - time_major (bool, optional): Indicate the data layout of Tensor included - in `input` and `output` tensors. If `False`, the data layout would - be batch major with shape `[batch_size, sequence_length, ...]`. If - `True`, the data layout would be time major with shape - `[sequence_length, batch_size, ...]`. Default: `False`. - param_attr (list|tuple|ParamAttr): A list, tuple or something can be - converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is - a list or tuple, it's length must equal to `num_layers`. Otherwise, - construct a list by `StackedRNNCell.stack_param_attr(param_attr, num_layers)`. - Default None. - bias_attr (list|tuple|ParamAttr): A list, tuple or something can be - converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is - a list or tuple, it's length must equal to `num_layers`. Otherwise, - construct a list by `StackedRNNCell.stack_param_attr(bias_attr, num_layers)`. - Default None. - dtype(string, optional): The data type used in this cell. It can be - float32 or float64. Default float32. - - Examples: - - .. 
code-block:: python - - import paddle - import paddle.fluid as fluid - from paddle.text import BidirectionalGRU - - inputs = paddle.rand((2, 4, 32)) - bi_gru = BidirectionalGRU(input_size=32, hidden_size=64, num_layers=2) - outputs, _ = bi_gru(inputs) # [2, 4, 128] - """ - - def __init__(self, - input_size, - hidden_size, - gate_activation=None, - activation=None, - forget_bias=1.0, - num_layers=1, - dropout=0.0, - merge_mode='concat', - merge_each_layer=False, - time_major=False, - param_attr=None, - bias_attr=None, - dtype='float32'): - super(BidirectionalGRU, self).__init__() - self.num_layers = num_layers - self.merge_mode = merge_mode - self.merge_each_layer = merge_each_layer - param_attrs = BidirectionalRNN.bidirect_param_attr(param_attr) - bias_attrs = BidirectionalRNN.bidirect_param_attr(bias_attr) - if not merge_each_layer: - cell_fw = StackedGRUCell(input_size, hidden_size, gate_activation, - activation, num_layers, dropout, - param_attrs[0], bias_attrs[0], dtype) - cell_bw = StackedGRUCell(input_size, hidden_size, gate_activation, - activation, num_layers, dropout, - param_attrs[1], bias_attrs[1], dtype) - self.gru = BidirectionalRNN( - cell_fw, cell_bw, merge_mode=merge_mode, time_major=time_major) - else: - fw_param_attrs = StackedRNNCell.stack_param_attr(param_attrs[0], - num_layers) - bw_param_attrs = StackedRNNCell.stack_param_attr(param_attrs[1], - num_layers) - fw_bias_attrs = StackedRNNCell.stack_param_attr(bias_attrs[0], - num_layers) - bw_bias_attrs = StackedRNNCell.stack_param_attr(bias_attrs[1], - num_layers) - - # maybe design cell including both forward and backward later - self.gru = [] - for i in range(num_layers): - cell_fw = StackedGRUCell(input_size if i == 0 else ( - hidden_size * 2 if merge_mode == 'concat' else - hidden_size), hidden_size, gate_activation, activation, 1, - dropout, fw_param_attrs[i], - fw_bias_attrs[i], dtype) - cell_bw = StackedGRUCell(input_size if i == 0 else ( - hidden_size * 2 if merge_mode == 'concat' else - hidden_size), hidden_size, gate_activation, activation, 1, - dropout, bw_param_attrs[i], - bw_bias_attrs[i], dtype) - self.gru.append( - self.add_sublayer( - "gru_%d" % i, - BidirectionalRNN( - cell_fw, - cell_bw, - merge_mode=merge_mode, - time_major=time_major))) - - def forward(self, inputs, initial_states=None, sequence_length=None): - """ - Performs bidirectional multi-layer GRU layer by layer. Each GRU's `outputs` - is the `inputs` of the subsequent one, or when `merge_each_layer` is True, - merged outputs would be the `inputs` of the subsequent one. - - Parameters: - inputs (Variable): The inputs for the first GRU. It is a float32 - or float64 tensor shaped `[batch_size, sequence_length, input_size]`. - initial_states (list|None, optional): A list containing initial states - of all stacked GRU. If `merge_each_layer` is True, the length of - list should be `num_layers` and a single value would be reused for - `num_layers`; Otherwise, the length should be 2 and a single value - would be reused twice. If not provided, use 0 as initial states. - Default None. - sequence_length (Variable, optional): A tensor with shape `[batch_size]`. - It stores real length of each instance, thus enables users to extract - the last valid state when past a batch element's sequence length for - correctness. If not provided, the paddings would be treated same as - non-padding inputs. Default None. 
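When ``merge_each_layer`` is True, the constructor above builds one bidirectional RNN per layer and the forward pass (shown next) simply chains them, so with ``merge_mode='concat'`` every layer after the first sees ``hidden_size * 2`` input features. A compact sketch of that chaining (the helper name is illustrative only):

.. code-block:: python

    def stacked_bidirectional_forward(bi_layers, inputs, initial_states):
        # each element of bi_layers is a bidirectional RNN; its merged
        # output becomes the input of the next layer
        stacked_states = []
        for layer, states in zip(bi_layers, initial_states):
            inputs, layer_states = layer(inputs, states)
            stacked_states.append(layer_states)
        return inputs, stacked_states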
- - Returns: - tuple: A tuple( :code:`(outputs, final_states)` ), where `outputs` \ - is the output of last bidirectional GRU; `final_states` is a \ - pair including `final_states` of forward and backward GRU when \ - `merge_each_layer` is False or a list including `final_states` \ - of all stacked bidirectional GRU, and it has tensors with same \ - shapes data types as `initial_states`. - """ - if not self.merge_each_layer: - return self.gru(inputs, initial_states, sequence_length) - else: - if isinstance(initial_states, (list, tuple)): - assert len(initial_states) == self.num_layers, ( - "length of initial_states should be %d when it is a list/tuple" - % self.num_layers) - else: - initial_states = [initial_states] * self.num_layers - stacked_states = [] - for i in range(self.num_layers): - outputs, states = self.gru[i](inputs, initial_states[i], - sequence_length) - inputs = outputs - stacked_states.append(states) - return outputs, stacked_states - - -class DynamicDecode(Layer): - """ - DynamicDecode integrates an Decoder instance to perform dynamic decoding. - - It performs :code:`decoder.step()` repeatedly until the returned Tensor - indicating finished status contains all True values or the number of - decoding step reaches to :attr:`max_step_num`. - - :code:`decoder.initialize()` would be called once before the decoding loop. - If the `decoder` has implemented `finalize` method, :code:`decoder.finalize()` - would be called once after the decoding loop. - - Parameters: - decoder (Decoder): An instance of `Decoder`. - max_step_num (int, optional): The maximum number of steps. If not provided, - decode until the decoder is fully done, or in other words, the returned - Tensor by :code:`decoder.step()` indicating finished status contains - all True. Default `None`. - output_time_major (bool, optional): Indicate the data layout of Tensor included - in the final outputs(the first returned value of this method). If - attr:`False`, the data layout would be batch major with shape - `[batch_size, seq_len, ...]`. If attr:`True`, the data layout would - be time major with shape `[seq_len, batch_size, ...]`. Default: `False`. - impute_finished (bool, optional): If `True`, then states get copied through - for batch entries which are marked as finished, which differs with the - unfinished using the new states returned by :code:`decoder.step()` and - ensures that the final states have the correct values. Otherwise, states - wouldn't be copied through when finished. If the returned `final_states` - is needed, it should be set as True, which causes some slowdown. - Default `False`. - is_test (bool, optional): A flag indicating whether to use test mode. In - test mode, it is more memory saving. Default `False`. - return_length (bool, optional): A flag indicating whether to return an - extra Tensor variable in the output tuple, which stores the actual - lengths of all decoded sequences. Default `False`. - - Examples: - - .. 
code-block:: python - - import paddle - import paddle.fluid as fluid - from paddle.fluid.layers import BeamSearchDecoder - from paddle.text import StackedLSTMCell, DynamicDecode - - paddle.disable_static() - - vocab_size, d_model, = 100, 32 - encoder_output = paddle.rand((2, 4, d_model)) - trg_embeder = fluid.dygraph.Embedding(size=[vocab_size, d_model]) - output_layer = fluid.dygraph.Linear(d_model, vocab_size) - cell = StackedLSTMCell(input_size=d_model, hidden_size=d_model) - decoder = BeamSearchDecoder(cell, - start_token=0, - end_token=1, - beam_size=4, - embedding_fn=trg_embeder, - output_fn=output_layer) - dynamic_decoder = DynamicDecode(decoder, max_step_num=10) - outputs = dynamic_decoder(cell.get_initial_states(encoder_output)) - """ - - def __init__(self, - decoder, - max_step_num=None, - output_time_major=False, - impute_finished=False, - is_test=False, - return_length=False): - super(DynamicDecode, self).__init__() - self.decoder = decoder - self.max_step_num = max_step_num - self.output_time_major = output_time_major - self.impute_finished = impute_finished - self.is_test = is_test - self.return_length = return_length - - def forward(self, inits=None, **kwargs): - """ - Performs :code:`decoder.step()` repeatedly until the returned Tensor - indicating finished status contains all True values or the number of - decoding step reaches to :attr:`max_step_num`. - - :code:`decoder.initialize()` would be called once before the decoding loop. - If the `decoder` has implemented `finalize` method, :code:`decoder.finalize()` - would be called once after the decoding loop. - - Parameters: - inits (object, optional): Argument passed to `decoder.initialize`. - Default `None`. - **kwargs: Additional keyword arguments. Arguments passed to `decoder.step`. - - Returns: - tuple: A tuple( :code:`(final_outputs, final_states, sequence_lengths)` ) \ - when `return_length` is True, otherwise a tuple( :code:`(final_outputs, final_states)` ). \ - The final outputs and states, both are Tensor or nested structure of Tensor. \ - `final_outputs` has the same structure and data types as the :code:`outputs` \ - returned by :code:`decoder.step()` , and each Tenser in `final_outputs` \ - is the stacked of all decoding steps' outputs, which might be revised \ - by :code:`decoder.finalize()` if the decoder has implemented `finalize`. \ - `final_states` is the counterpart at last time step of initial states \ - returned by :code:`decoder.initialize()` , thus has the same structure \ - with it and has tensors with same shapes and data types. `sequence_lengths` \ - is an `int64` tensor with the same shape as `finished` returned \ - by :code:`decoder.initialize()` , and it stores the actual lengths of \ - all decoded sequences. - """ - if fluid.in_dygraph_mode(): - - class ArrayWrapper(object): - def __init__(self, x): - self.array = [x] - - def append(self, x): - self.array.append(x) - return self - - def __getitem__(self, item): - return self.array.__getitem__(item) - - def _maybe_copy(state, new_state, step_mask): - # TODO: use where_op - state_dtype = state.dtype - if convert_dtype(state_dtype) in ["bool"]: - state = layers.cast(state, dtype="float32") - new_state = layers.cast(new_state, dtype="float32") - if step_mask.dtype != state.dtype: - step_mask = layers.cast(step_mask, dtype=state.dtype) - # otherwise, renamed bool gradients of would be summed up leading - # to sum(bool) error. 
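# Note on the masking a few lines below: the two elementwise_mul terms compute
#   new_state = state * step_mask - new_state * (step_mask - 1)
#             = step_mask * state + (1 - step_mask) * new_state,
# i.e. entries whose mask is 1 keep the previous state and the rest take the
# freshly computed one; called with `finished` as the mask, this is how
# `impute_finished` carries finished entries' states through unchanged.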
- step_mask.stop_gradient = True - new_state = layers.elementwise_mul( - state, step_mask, axis=0) - layers.elementwise_mul( - new_state, (step_mask - 1), axis=0) - if convert_dtype(state_dtype) in ["bool"]: - new_state = layers.cast(new_state, dtype=state_dtype) - return new_state - - initial_inputs, initial_states, initial_finished = self.decoder.initialize( - inits) - inputs, states, finished = (initial_inputs, initial_states, - initial_finished) - cond = layers.logical_not((layers.reduce_all(initial_finished))) - sequence_lengths = layers.cast( - layers.zeros_like(initial_finished), "int64") - outputs = None - - step_idx = 0 - step_idx_tensor = layers.fill_constant( - shape=[1], dtype="int64", value=step_idx) - while cond.numpy(): - (step_outputs, next_states, next_inputs, - next_finished) = self.decoder.step(step_idx_tensor, inputs, - states, **kwargs) - if not self.decoder.tracks_own_finished: - # BeamSearchDecoder would track it own finished, since - # beams would be reordered and the finished status of each - # entry might change. Otherwise, perform logical OR which - # would not change the already finished. - next_finished = layers.logical_or(next_finished, finished) - # To confirm states.finished/finished be consistent with - # next_finished. - layers.assign(next_finished, finished) - next_sequence_lengths = layers.elementwise_add( - sequence_lengths, - layers.cast( - layers.logical_not(finished), sequence_lengths.dtype)) - - if self.impute_finished: # rectify the states for the finished. - next_states = map_structure( - lambda x, y: _maybe_copy(x, y, finished), states, - next_states) - outputs = map_structure( - lambda x: ArrayWrapper(x), - step_outputs) if step_idx == 0 else map_structure( - lambda x, x_array: x_array.append(x), step_outputs, - outputs) - inputs, states, finished, sequence_lengths = ( - next_inputs, next_states, next_finished, - next_sequence_lengths) - - layers.increment(x=step_idx_tensor, value=1.0, in_place=True) - step_idx += 1 - - layers.logical_not(layers.reduce_all(finished), cond) - if self.max_step_num is not None and step_idx > self.max_step_num: - break - - final_outputs = map_structure( - lambda x: fluid.layers.stack(x.array, axis=0), outputs) - final_states = states - - try: - final_outputs, final_states = self.decoder.finalize( - final_outputs, final_states, sequence_lengths) - except NotImplementedError: - pass - - if not self.output_time_major: - final_outputs = map_structure( - lambda x: layers.transpose(x, [1, 0] + list( - range(2, len(x.shape)))), final_outputs) - - return (final_outputs, final_states, - sequence_lengths) if self.return_length else (final_outputs, - final_states) - else: - return fluid.layers.dynamic_decode( - self.decoder, - inits, - max_step_num=self.max_step_num, - output_time_major=self.output_time_major, - impute_finished=self.impute_finished, - is_test=self.is_test, - return_length=self.return_length, - **kwargs) - - -class Conv1dPoolLayer(Layer): - """ - This interface is used to construct a callable object of the ``Conv1DPoolLayer`` - class. The ``Conv1DPoolLayer`` class does a ``Conv1D`` and a ``Pool1D`` . - For more details, refer to code examples.The ``Conv1DPoolLayer`` layer calculates - the output based on the input, filter and strides, paddings, dilations, groups, - global_pooling, pool_type, ceil_mode, exclusive parameters. - - Parameters: - num_channels (int): The number of channels in the input data. - num_filters(int): The number of filters. It is the same as the output channels. 
- filter_size (int): The filter size of Conv1DPoolLayer. - pool_size (int): The pooling size of Conv1DPoolLayer. - conv_stride (int): The stride size of the conv Layer in Conv1DPoolLayer. - Default: 1 - pool_stride (int): The stride size of the pool layer in Conv1DPoolLayer. - Default: 1 - conv_padding (int): The padding size of the conv Layer in Conv1DPoolLayer. - Default: 0 - pool_padding (int): The padding of pool layer in Conv1DPoolLayer. - Default: 0 - act (str): Activation type for conv layer, if it is set to None, activation - is not appended. Default: None. - pool_type (str): Pooling type can be `max` for max-pooling or `avg` for - average-pooling. Default: `max` - dilation (int): The dilation size of the conv Layer. Default: 1. - groups (int): The groups number of the conv Layer. According to grouped - convolution in Alex Krizhevsky's Deep CNN paper: when group=2, the - first half of the filters is only connected to the first half of the - input channels, while the second half of the filters is only connected - to the second half of the input channels. Default: 1. - global_pooling (bool): Whether to use the global pooling. If it is true, - `pool_size` and `pool_padding` would be ignored. Default: False - ceil_mode (bool, optional): Whether to use the ceil function to calculate output - height and width.False is the default. If it is set to False, the floor function - will be used. Default: False. - exclusive (bool, optional): Whether to exclude padding points in average pooling mode. - Default: True. - use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn - library is installed. Default: False - param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights - of conv2d. If it is set to None or one attribute of ParamAttr, conv2d - will create ParamAttr as param_attr. If the Initializer of the param_attr - is not set, the parameter is initialized with :math:`Normal(0.0, std)`, - and the :math:`std` is :math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`. Default: None. - bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of conv2d. - If it is set to False, no bias will be added to the output units. - If it is set to None or one attribute of ParamAttr, conv2d - will create ParamAttr as bias_attr. If the Initializer of the bias_attr - is not set, the bias is initialized zero. Default: None. - - Example: - .. 
code-block:: python - - import paddle - import paddle.fluid as fluid - from paddle.text import Conv1dPoolLayer - - # input: [batch_size, num_channels, sequence_length] - input = paddle.rand((2, 32, 4)) - cov2d = Conv1dPoolLayer(num_channels=32, - num_filters=64, - filter_size=2, - pool_size=2) - output = cov2d(input) - """ - - def __init__(self, - num_channels, - num_filters, - filter_size, - pool_size, - conv_stride=1, - pool_stride=1, - conv_padding=0, - pool_padding=0, - act=None, - pool_type='max', - global_pooling=False, - dilation=1, - groups=None, - ceil_mode=False, - exclusive=True, - use_cudnn=False, - param_attr=None, - bias_attr=None): - super(Conv1dPoolLayer, self).__init__() - self._conv2d = Conv2D( - num_channels=num_channels, - num_filters=num_filters, - filter_size=[filter_size, 1], - stride=[conv_stride, 1], - padding=[conv_padding, 0], - dilation=[dilation, 1], - groups=groups, - param_attr=param_attr, - bias_attr=bias_attr, - use_cudnn=use_cudnn, - act=act) - self._pool2d = Pool2D( - pool_size=[pool_size, 1], - pool_type=pool_type, - pool_stride=[pool_stride, 1], - pool_padding=[pool_padding, 0], - global_pooling=global_pooling, - use_cudnn=use_cudnn, - ceil_mode=ceil_mode, - exclusive=exclusive) - - def forward(self, input): - """ - Performs conv1d and pool1d on the input. - - Parameters: - input (Variable): A 3-D Tensor, shape is [N, C, H] where N, C and H - representing `batch_size`, `num_channels` and `sequence_length` - separately. data type can be float32 or float64 - - Returns: - Variable: The 3-D output tensor after conv and pool. It has the same \ - data type as input. - """ - x = fluid.layers.unsqueeze(input, axes=[-1]) - x = self._conv2d(x) - x = self._pool2d(x) - x = fluid.layers.squeeze(x, axes=[-1]) - return x - - -class CNNEncoder(Layer): - """ - This interface is used to construct a callable object of the ``CNNEncoder`` - class. The ``CNNEncoder`` is composed of multiple ``Conv1dPoolLayer`` . - ``CNNEncoder`` can define every Conv1dPoolLayer with different or same parameters. - The ``Conv1dPoolLayer`` in ``CNNEncoder`` is parallel. The results of every - ``Conv1dPoolLayer`` will concat at the channel dimension as the final output. - - Parameters: - num_channels(int|list|tuple): The number of channels in the input data. If - `num_channels` is a list or tuple, the length of `num_channels` must - equal to `num_layers`. If `num_channels` is a int, all conv1dpoollayer's - `num_channels` are the value of `num_channels`. - num_filters(int|list|tuple): The number of filters. It is the same as the - output channels. If `num_filters` is a list or tuple, the length of - `num_filters` must equal `num_layers`. If `num_filters` is a int, - all conv1dpoollayer's `num_filters` are the value of `num_filters`. - filter_size(int|list|tuple): The filter size of Conv1DPoolLayer in CNNEncoder. - If `filter_size` is a list or tuple, the length of `filter_size` must - equal `num_layers`. If `filter_size` is a int, all conv1dpoollayer's - `filter_size` are the value of `filter_size`. - pool_size(int|list|tuple): The pooling size of Conv1DPoolLayer in CNNEncoder. - If `pool_size` is a list or tuple, the length of `pool_size` must equal - `num_layers`. If `pool_size` is a int, all conv1dpoollayer's `pool_size` - are the value of `pool_size`. - num_layers(int): The number of conv1dpoolLayer used in CNNEncoder. - conv_stride(int|list|tuple): The stride size of the conv Layer in Conv1DPoolLayer. 
- If `conv_stride` is a list or tuple, the length of `conv_stride` must - equal `num_layers`. If conv_stride is a int, all conv1dpoollayer's `conv_stride` - are the value of `conv_stride`. Default: 1 - pool_stride(int|list|tuple): The stride size of the pool layer in Conv1DPoolLayer. - If `pool_stride` is a list or tuple, the length of `pool_stride` must - equal `num_layers`. If `pool_stride` is a int, all conv1dpoollayer's `pool_stride` - are the value of `pool_stride`. Default: 1 - conv_padding(int|list|tuple): The padding size of the conv Layer in Conv1DPoolLayer. - If `conv_padding` is a list or tuple, the length of `conv_padding` must - equal `num_layers`. If `conv_padding` is a int, all conv1dpoollayer's `conv_padding` - are the value of `conv_padding`. Default: 0 - pool_padding(int|list|tuple): The padding size of pool layer in Conv1DPoolLayer. - If `pool_padding` is a list or tuple, the length of `pool_padding` must - equal `num_layers`.If `pool_padding` is a int, all conv1dpoollayer's `pool_padding` - are the value of `pool_padding`. Default: 0 - act (str|list|tuple): Activation type for `Conv1dPoollayer` layer, if it is set to None, - activation is not appended. Default: None. - pool_type (str): Pooling type can be `max` for max-pooling or `avg` for - average-pooling. Default: `max` - global_pooling (bool): Whether to use the global pooling. If it is true, - `pool_size` and `pool_padding` would be ignored. Default: False - use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn - library is installed. Default: False - - Example: - .. code-block:: python - - import paddle - import paddle.fluid as fluid - from paddle.text import CNNEncoder - - # input: [batch_size, num_channels, sequence_length] - input = paddle.rand((2, 32, 8)) - cov_encoder = CNNEncoder(num_layers=2, - num_channels=32, - num_filters=64, - filter_size=[2, 3], - pool_size=[7, 6]) - output = cov_encoder(input) # [2, 128, 1] - """ - - def __init__(self, - num_channels, - num_filters, - filter_size, - pool_size, - num_layers=1, - conv_stride=1, - pool_stride=1, - conv_padding=0, - pool_padding=0, - act=None, - pool_type='max', - global_pooling=False, - use_cudnn=False): - super(CNNEncoder, self).__init__() - self.num_layers = num_layers - self.num_channels = num_channels - self.num_filters = num_filters - self.filter_size = filter_size - self.pool_size = pool_size - self.conv_stride = conv_stride - self.pool_stride = pool_stride - self.conv_padding = conv_padding - self.pool_padding = pool_padding - self.use_cudnn = use_cudnn - self.act = act - self.pool_type = pool_type - self.global_pooling = global_pooling - self.conv1d_pool_layers = fluid.dygraph.LayerList([ - Conv1dPoolLayer( - num_channels=self.num_channels - if isinstance(self.num_channels, int) else self.num_channels[i], - num_filters=self.num_filters - if isinstance(self.num_channels, int) else self.num_filters[i], - filter_size=self.filter_size - if isinstance(self.filter_size, int) else self.filter_size[i], - pool_size=self.pool_size - if isinstance(self.pool_size, int) else self.pool_size[i], - conv_stride=self.conv_stride - if isinstance(self.conv_stride, int) else self.conv_stride[i], - pool_stride=self.pool_stride - if isinstance(self.pool_stride, int) else self.pool_stride[i], - conv_padding=self.conv_padding - if isinstance(self.conv_padding, int) else self.conv_padding[i], - pool_padding=self.pool_padding - if isinstance(self.pool_padding, int) else self.pool_padding[i], - act=self.act[i] - if isinstance(self.act, (list, tuple)) 
else self.act, - pool_type=self.pool_type, - global_pooling=self.global_pooling, - use_cudnn=self.use_cudnn) for i in range(num_layers) - ]) - - def forward(self, input): - """ - Performs multiple parallel conv1d and pool1d, and concat the results of - them at the channel dimension to produce the final output. - - Parameters: - input (Variable): A 3-D Tensor, shape is [N, C, H] where N, C and H - representing `batch_size`, `num_channels` and `sequence_length` - separately. data type can be float32 or float64 - - Returns: - Variable: The 3-D output tensor produced by concatenating results of \ - all Conv1dPoolLayer. It has the same data type as input. - """ - res = [ - conv1d_pool_layer(input) - for conv1d_pool_layer in self.conv1d_pool_layers - ] - out = fluid.layers.concat(input=res, axis=1) - return out - - -class TransformerCell(RNNCell): - """ - TransformerCell wraps a Transformer decoder producing logits from `inputs` - composed by ids and position. - - Parameters: - decoder(callable): A TransformerDecoder instance. Or a wrapper of it that - includes a embedding layer accepting ids and positions instead of embeddings - and includes a output layer transforming decoder output features to logits. - embedding_fn(function, optional): A callable that accepts ids and position - as arguments and return embeddings as input of `decoder`. It can be - None if `decoder` includes a embedding layer. Default None. - output_fn(callable, optional): A callable applid on `decoder` output to - transform decoder output features to get logits. Mostly it is a Linear - layer with vocabulary size. It can be None if `decoder` includes a - output layer. Default None. - - Examples: - - .. code-block:: python - - import paddle - import paddle.fluid as fluid - from paddle.fluid.dygraph import Embedding, Linear - from paddle.text import TransformerDecoder - from paddle.text import TransformerCell - from paddle.text import TransformerBeamSearchDecoder - from paddle.text import DynamicDecode - - paddle.disable_static() - - class Embedder(fluid.dygraph.Layer): - def __init__(self): - super(Embedder, self).__init__() - self.word_embedder = Embedding(size=[1000, 128]) - self.pos_embedder = Embedding(size=[500, 128]) - - def forward(self, word, position): - return self.word_embedder(word) + self.pos_embedder(position) - - embedder = Embedder() - output_layer = Linear(128, 1000) - decoder = TransformerDecoder(2, 2, 64, 64, 128, 512) - transformer_cell = TransformerCell(decoder, embedder, output_layer) - dynamic_decoder = DynamicDecode( - TransformerBeamSearchDecoder( - transformer_cell, - start_token=0, - end_token=1, - beam_size=4, - var_dim_in_state=2), - max_step_num=10, - is_test=True) - - enc_output = paddle.rand((2, 4, 128)) - # cross attention bias: [batch_size, n_head, trg_len, src_len] - trg_src_attn_bias = paddle.rand((2, 2, 1, 4)) - # inputs for beam search on Transformer - caches = transformer_cell.get_initial_states(enc_output) - enc_output = TransformerBeamSearchDecoder.tile_beam_merge_with_batch( - enc_output, beam_size=4) - trg_src_attn_bias = TransformerBeamSearchDecoder.tile_beam_merge_with_batch( - trg_src_attn_bias, beam_size=4) - static_caches = decoder.prepare_static_cache(enc_output) - outputs = dynamic_decoder( - inits=caches, - enc_output=enc_output, - trg_src_attn_bias=trg_src_attn_bias, - static_caches=static_caches) - """ - - def __init__(self, decoder, embedding_fn=None, output_fn=None): - super(TransformerCell, self).__init__() - self.decoder = decoder - self.embedding_fn = embedding_fn - 
self.output_fn = output_fn - - def forward(self, - inputs, - states=None, - enc_output=None, - trg_slf_attn_bias=None, - trg_src_attn_bias=None, - static_caches=[]): - """ - Produces logits from `inputs` composed by ids and positions. - - Parameters: - inputs(tuple): A tuple includes target ids and positions. The two - tensors both have int64 data type and with 2D shape - `[batch_size, sequence_length]` where `sequence_length` is 1 - for inference. - states(list): It caches the multi-head attention intermediate results - of history decoding steps. It is a list of dict where the length - of list is decoder layer number, and each dict has `k` and `v` as - keys and values are cached results. Default None - enc_output(Variable): The output of Transformer encoder. It is a tensor - with shape `[batch_size, sequence_length, d_model]`. The data type - should be float32 or float64. - trg_slf_attn_bias(Variable, optional): A tensor used in decoder self - attention to mask out attention on unwanted target positions. It - is a tensor with shape `[batch_size, n_head, target_length, target_length]`, - where the unwanted positions have `-INF` values and the others - have 0 values. It can be None when nothing wanted or needed to - be masked out. It can be None for inference. The data type should - be float32 or float64. Default None - trg_src_attn_bias(Variable, optional): A tensor used in decoder-encoder - cross attention to mask out unwanted attention on source (encoder output). - It is a tensor with shape `[batch_size, n_head, target_length, source_length]`, - where the unwanted positions have `-INF` values and the others - have 0 values. It can be None when nothing wanted or needed to - be masked out. The data type should be float32 or float64. Default None - static_caches(list): It stores projected results of encoder output - to be used as keys and values in decoder-encoder cross attention - It is a list of dict where the length of list is decoder layer - number, and each dict has `static_k` and `static_v` as keys and - values are stored results. Default empty list - - Returns: - tuple: A tuple( :code:`(outputs, new_states)` ), where `outputs` \ - is a float32 or float64 3D tensor representing logits shaped \ - `[batch_size, sequence_length, vocab_size]`. `new_states has \ - the same structure and data type with `states` while the length \ - is one larger since the intermediate results of current step are \ - concatenated into it. - """ - trg_word, trg_pos = inputs - if states and static_caches: - for cache, static_cache in zip(states, static_caches): - cache.update(static_cache) - if self.embedding_fn is not None: - dec_input = self.embedding_fn(trg_word, trg_pos) - outputs = self.decoder(dec_input, enc_output, None, - trg_src_attn_bias, states) - else: - outputs = self.decoder(trg_word, trg_pos, enc_output, None, - trg_src_attn_bias, states) - if self.output_fn is not None: - outputs = self.output_fn(outputs) - - new_states = [{ - "k": cache["k"], - "v": cache["v"] - } for cache in states] if states else states - return outputs, new_states - - @property - def state_shape(self): - """ - States of TransformerCell cache the multi-head attention intermediate - results of history decoding steps, and have a increasing length as - decoding continued. - - `state_shape` of TransformerCell is used to initialize states. It is a - list of dict where the length of list is decoder layer, and each dict - has `k` and `v` as keys and values are `[n_head, 0, d_key]`, `[n_head, 0, d_value]` - separately. 
(-1 for batch size would be automatically inserted into shape). - - Returns: - list: It is a list of dict where the length of list is decoder layer \ - number, and each dict has `k` and `v` as keys and values are cached \ - results. - """ - return [{ - "k": [self.decoder.n_head, 0, self.decoder.d_key], - "v": [self.decoder.n_head, 0, self.decoder.d_value], - } for i in range(self.decoder.n_layer)] - - -class TransformerBeamSearchDecoder(layers.BeamSearchDecoder): - """ - Compared with a RNN step :code:`outputs, new_states = cell(inputs, states)`, - Transformer decoder's `inputs` uses 2D tensor shaped `[batch_size * beam_size, 1]` - and includes extra position data. And its `states` (caches) has increasing - length. These are not consistent with `BeamSearchDecoder`, thus subclass - `BeamSearchDecoder` to make beam search adapt to Transformer decoder. - - Parameters: - cell(TransformerCell): An instance of `TransformerCell`. - start_token(int): The start token id. - end_token(int): The end token id. - beam_size(int): The beam width used in beam search. - var_dim_in_state(int): Indicate which dimension of states is variant. - - Examples: - - .. code-block:: python - - import paddle - import paddle.fluid as fluid - from paddle.fluid.dygraph import Embedding, Linear - from paddle.text import TransformerDecoder - from paddle.text import TransformerCell - from paddle.text import TransformerBeamSearchDecoder - from paddle.text import DynamicDecode - - paddle.disable_static() - - class Embedder(fluid.dygraph.Layer): - def __init__(self): - super(Embedder, self).__init__() - self.word_embedder = Embedding(size=[1000, 128]) - self.pos_embedder = Embedding(size=[500, 128]) - - def forward(self, word, position): - return self.word_embedder(word) + self.pos_embedder(position) - - embedder = Embedder() - output_layer = Linear(128, 1000) - decoder = TransformerDecoder(2, 2, 64, 64, 128, 512) - transformer_cell = TransformerCell(decoder, embedder, output_layer) - dynamic_decoder = DynamicDecode( - TransformerBeamSearchDecoder( - transformer_cell, - start_token=0, - end_token=1, - beam_size=4, - var_dim_in_state=2), - max_step_num=10, - is_test=True) - - enc_output = paddle.rand((2, 4, 128)) - # cross attention bias: [batch_size, n_head, trg_len, src_len] - trg_src_attn_bias = paddle.rand((2, 2, 1, 4)) - # inputs for beam search on Transformer - caches = transformer_cell.get_initial_states(enc_output) - enc_output = TransformerBeamSearchDecoder.tile_beam_merge_with_batch( - enc_output, beam_size=4) - trg_src_attn_bias = TransformerBeamSearchDecoder.tile_beam_merge_with_batch( - trg_src_attn_bias, beam_size=4) - static_caches = decoder.prepare_static_cache(enc_output) - outputs = dynamic_decoder( - inits=caches, - enc_output=enc_output, - trg_src_attn_bias=trg_src_attn_bias, - static_caches=static_caches) - """ - - def __init__(self, cell, start_token, end_token, beam_size, - var_dim_in_state): - super(TransformerBeamSearchDecoder, - self).__init__(cell, start_token, end_token, beam_size) - self.cell = cell - self.var_dim_in_state = var_dim_in_state - - def _merge_batch_beams_with_var_dim(self, x): - """ - Reshape a tensor with shape `[batch_size, beam_size, ...]` to a new - tensor with shape `[batch_size * beam_size, ...]`. - - Parameters: - x(Variable): A tensor with shape `[batch_size, beam_size, ...]`. The - data type should be float32, float64, int32, int64 or bool. - - Returns: - Variable: A tensor with shape `[batch_size * beam_size, ...]`, whose \ - data type is same as `x`. 
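As a shape-only illustration of merging and splitting the batch and beam dimensions (a minimal numpy sketch with made-up sizes; the method above additionally has to keep the growing cache axis out of the static part of the reshape):

.. code-block:: python

    import numpy as np

    batch_size, beam_size, n_head, t, d_key = 2, 4, 2, 3, 64
    # one cache entry laid out per beam: [batch_size, beam_size, n_head, t, d_key]
    x = np.zeros((batch_size, beam_size, n_head, t, d_key))
    # merge: fold the beam dimension into the batch dimension
    merged = x.reshape((batch_size * beam_size,) + x.shape[2:])         # (8, 2, 3, 64)
    # split: recover the beam dimension again
    split = merged.reshape((batch_size, beam_size) + merged.shape[1:])  # (2, 4, 2, 3, 64)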
- """ - # init length of cache is 0, and it increases with decoding carrying on, - # thus need to reshape elaborately - var_dim_in_state = self.var_dim_in_state + 1 # count in beam dim - x = layers.transpose(x, - list(range(var_dim_in_state, len(x.shape))) + - list(range(0, var_dim_in_state))) - x = layers.reshape( - x, [0] * (len(x.shape) - var_dim_in_state - ) + [self.batch_size * self.beam_size] + - [int(size) for size in x.shape[-var_dim_in_state + 2:]]) - x = layers.transpose( - x, - list(range((len(x.shape) + 1 - var_dim_in_state), len(x.shape))) + - list(range(0, (len(x.shape) + 1 - var_dim_in_state)))) - return x - - def _split_batch_beams_with_var_dim(self, x): - """ - Reshape a tensor with shape `[batch_size * beam_size, ...]` to a new - tensor with shape `[batch_size, beam_size, ...]`. - - Parameters: - x(Variable): A tensor with shape `[batch_size * beam_size, ...]`. The - data type should be float32, float64, int32, int64 or bool. - - Returns: - Variable: A tensor with shape `[batch_size, beam_size, ...]`, whose \ - data type is same as `x`. - """ - var_dim_size = layers.shape(x)[self.var_dim_in_state] - x = layers.reshape( - x, [-1, self.beam_size] + - [int(size) - for size in x.shape[1:self.var_dim_in_state]] + [var_dim_size] + - [int(size) for size in x.shape[self.var_dim_in_state + 1:]]) - return x - - def step(self, time, inputs, states, **kwargs): - """ - Perform a beam search decoding step, which uses `cell` to get probabilities, - and follows a beam search step to calculate scores and select candidate - token ids. - - Note: compared with `BeamSearchDecoder.step`, it feed 2D id tensor shaped - `[batch_size * beam_size, 1]` rather than `[batch_size * beam_size]` combined - position data as inputs to `cell`. - - Parameters: - time(Variable): An `int64` tensor with shape `[1]` provided by the caller, - representing the current time step number of decoding. - inputs(Variable): A tensor variable. It is same as `initial_inputs` - returned by `initialize()` for the first decoding step and - `next_inputs` returned by `step()` for the others. It is a int64 - id tensor with shape `[batch_size * beam_size]` - states(Variable): A structure of tensor variables. - It is same as the `initial_states` returned by `initialize()` for - the first decoding step and `beam_search_state` returned by - `step()` for the others. - **kwargs: Additional keyword arguments, provided by the caller. - - Returns: - tuple: A tuple( :code:`(beam_search_output, beam_search_state, next_inputs, finished)` ). \ - `beam_search_state` and `next_inputs` have the same structure, \ - shape and data type as the input arguments `states` and `inputs` separately. \ - `beam_search_output` is a namedtuple(including scores, predicted_ids, \ - parent_ids as fields) of tensor variables, where \ - `scores, predicted_ids, parent_ids` all has a tensor value shaped \ - `[batch_size, beam_size]` with data type `float32, int64, int64`. \ - `finished` is a `bool` tensor with shape `[batch_size, beam_size]`. 
- """ - # compared to RNN, Transformer has 3D data at every decoding step - inputs = layers.reshape(inputs, [-1, 1]) # token - pos = layers.ones_like(inputs) * time # pos - cell_states = map_structure(self._merge_batch_beams_with_var_dim, - states.cell_states) - - cell_outputs, next_cell_states = self.cell((inputs, pos), cell_states, - **kwargs) - - # squeeze to adapt to BeamSearchDecoder which use 2D logits - cell_outputs = map_structure( - lambda x: layers.squeeze(x, [1]) if len(x.shape) == 3 else x, - cell_outputs) - cell_outputs = map_structure(self._split_batch_beams, cell_outputs) - next_cell_states = map_structure(self._split_batch_beams_with_var_dim, - next_cell_states) - - beam_search_output, beam_search_state = self._beam_search_step( - time=time, - logits=cell_outputs, - next_cell_states=next_cell_states, - beam_state=states) - next_inputs, finished = (beam_search_output.predicted_ids, - beam_search_state.finished) - - return (beam_search_output, beam_search_state, next_inputs, finished) - - -### Transformer Modules ### -class PrePostProcessLayer(Layer): - """ - PrePostProcessLayer is used before/after each multi-head attention(MHA) and - feed-forward network(FFN) sub-layer to perform some specific process on - inputs/outputs. - - Parameters: - process_cmd (str): The process applied before/after each MHA and - FFN sub-layer. It should be a string composed of `d`, `a`, `n`, - where `d` for dropout, `a` for add residual connection, `n` for - layer normalization. - d_model (int): The expected feature size in the input and output. - dropout_rate (float): The dropout probability if the process includes - dropout. Default 0.1 - - Examples: - - .. code-block:: python - - import paddle - import paddle.fluid as fluid - from paddle.text import PrePostProcessLayer - - # input: [batch_size, sequence_length, d_model] - x = paddle.rand((2, 4, 32)) - process = PrePostProcessLayer('n', 32) - out = process(x) # [2, 4, 32] - """ - - def __init__(self, process_cmd, d_model, dropout_rate=0.1): - super(PrePostProcessLayer, self).__init__() - self.process_cmd = process_cmd - self.functors = [] - for cmd in self.process_cmd: - if cmd == "a": # add residual connection - self.functors.append(lambda x, y: x + y if y is not None else x) - elif cmd == "n": # add layer normalization - layer_norm = LayerNorm( - normalized_shape=d_model, - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(1.)), - bias_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(0.))) - - self.functors.append( - self.add_sublayer( - "layer_norm_%d" % len( - self.sublayers(include_sublayers=False)), - layer_norm)) - elif cmd == "d": # add dropout - self.functors.append(lambda x: layers.dropout( - x, dropout_prob=dropout_rate, is_test=False) - if dropout_rate else x) - - def forward(self, x, residual=None): - """ - Applies `process_cmd` specified process on `x`. - - Parameters: - x (Variable): The tensor to be processed. The data type should be float32 - or float64. The shape is `[batch_size, sequence_length, d_model]`. - - residual (Variable, optional): Only used if the process includes - residual connection. It has the same shape and data type as `x`. - Default None - - Returns: - Variable: The processed tensor. It has the same shape and data type \ - as `x`. 
- """ - for i, cmd in enumerate(self.process_cmd): - if cmd == "a": - x = self.functors[i](x, residual) - else: - x = self.functors[i](x) - return x - - -class MultiHeadAttention(Layer): - """ - MultiHead Attention mapps queries and a set of key-value pairs to outputs - by jointly attending to information from different representation subspaces, - as what multi-head indicates it performs multiple attention in parallel. - - Please refer to `Attention Is All You Need `_ - for more details. - - Parameters: - d_key (int): The feature size to transformer queries and keys as in - multi-head attention. Mostly it equals to `d_model // n_head`. - d_value (int): The feature size to transformer values as in multi-head - attention. Mostly it equals to `d_model // n_head`. - d_model (int): The expected feature size in the input and output. - n_head (int): The number of heads in multi-head attention(MHA). - dropout_rate (float, optional): The dropout probability used in MHA to - drop some attention target. Default 0.1 - - Examples: - - .. code-block:: python - - import paddle - import paddle.fluid as fluid - from paddle.text import MultiHeadAttention - - # encoder input: [batch_size, sequence_length, d_model] - query = paddle.rand((2, 4, 128)) - # self attention bias: [batch_size, n_head, src_len, src_len] - attn_bias = paddle.rand((2, 2, 4, 4)) - multi_head_attn = MultiHeadAttention(64, 64, 128, n_head=2) - output = multi_head_attn(query, attn_bias=attn_bias) # [2, 4, 128] - """ - - def __init__(self, d_key, d_value, d_model, n_head, dropout_rate=0.1): - - super(MultiHeadAttention, self).__init__() - self.n_head = n_head - self.d_key = d_key - self.d_value = d_value - self.d_model = d_model - self.dropout_rate = dropout_rate - - self.q_fc = Linear( - input_dim=d_model, output_dim=d_key * n_head, bias_attr=False) - self.k_fc = Linear( - input_dim=d_model, output_dim=d_key * n_head, bias_attr=False) - self.v_fc = Linear( - input_dim=d_model, output_dim=d_value * n_head, bias_attr=False) - self.proj_fc = Linear( - input_dim=d_value * n_head, output_dim=d_model, bias_attr=False) - - def _prepare_qkv(self, queries, keys, values, cache=None): - """ - Prapares linear projected queries, keys and values for usage of subsequnt - multiple attention in parallel. If `cache` is not None, using cached - results to reduce redundant calculations. - - Parameters: - queries (Variable): The queries for multi-head attention. It is a - tensor with shape `[batch_size, sequence_length, d_model]`. The - data type should be float32 or float64. - keys (Variable, optional): The keys for multi-head attention. It is - a tensor with shape `[batch_size, sequence_length, d_model]`. The - data type should be float32 or float64. - values (Variable, optional): The values for multi-head attention. It - is a tensor with shape `[batch_size, sequence_length, d_model]`. - The data type should be float32 or float64. - cache(dict, optional): It is a dict with `k` and `v` as keys, and - values cache the multi-head attention intermediate results of - history decoding steps for decoder self attention; Or a dict - with `static_k` and `statkc_v` as keys, and values stores intermediate - results of encoder output for decoder-encoder cross attention. - If it is for decoder self attention, values for `k` and `v` would - be updated by new tensors concatanating raw tensors with intermediate - results of current step. It is only used for inference and should - be None for training. 
Default None - - Returns: - tuple: A tuple including linear projected keys and values. These two \ - tensors have shapes `[batch_size, n_head, sequence_length, d_key]` \ - and `[batch_size, n_head, sequence_length, d_value]` separately, \ - and their data types are same as inputs. - """ - if keys is None: # self-attention - keys, values = queries, queries - static_kv = False - else: # cross-attention - static_kv = True - - q = self.q_fc(queries) - q = layers.reshape(x=q, shape=[0, 0, self.n_head, self.d_key]) - q = layers.transpose(x=q, perm=[0, 2, 1, 3]) - - if cache is not None and static_kv and "static_k" in cache: - # for encoder-decoder attention in inference and has cached - k = cache["static_k"] - v = cache["static_v"] - else: - k = self.k_fc(keys) - v = self.v_fc(values) - k = layers.reshape(x=k, shape=[0, 0, self.n_head, self.d_key]) - k = layers.transpose(x=k, perm=[0, 2, 1, 3]) - v = layers.reshape(x=v, shape=[0, 0, self.n_head, self.d_value]) - v = layers.transpose(x=v, perm=[0, 2, 1, 3]) - - if cache is not None: - if static_kv and not "static_k" in cache: - # for encoder-decoder attention in inference and has not cached - cache["static_k"], cache["static_v"] = k, v - elif not static_kv: - # for decoder self-attention in inference - cache_k, cache_v = cache["k"], cache["v"] - k = layers.concat([cache_k, k], axis=2) - v = layers.concat([cache_v, v], axis=2) - cache["k"], cache["v"] = k, v - - return q, k, v - - def forward(self, - queries, - keys=None, - values=None, - attn_bias=None, - cache=None): - """ - Applies multi-head attention to map queries and a set of key-value pairs - to outputs. - - Parameters: - queries (Variable): The queries for multi-head attention. It is a - tensor with shape `[batch_size, sequence_length, d_model]`. The - data type should be float32 or float64. - keys (Variable, optional): The keys for multi-head attention. It is - a tensor with shape `[batch_size, sequence_length, d_model]`. The - data type should be float32 or float64. - values (Variable, optional): The values for multi-head attention. It - is a tensor with shape `[batch_size, sequence_length, d_model]`. - The data type should be float32 or float64. - attn_bias (Variable, optional): A tensor used in multi-head attention - to mask out attention on unwanted positions, usually the - paddings or the subsequent positions. It is a tensor with shape - `[batch_size, n_head, sequence_length, sequence_length]`, - where the unwanted positions have `-INF` values and the others - have 0 values. The data type should be float32 or float64. It can - be None when nothing wanted or needed to be masked out. Default None - cache(dict, optional): It is a dict with `k` and `v` as keys, and - values cache the multi-head attention intermediate results of - history decoding steps for decoder self attention; Or a dict - with `static_k` and `statkc_v` as keys, and values stores intermediate - results of encoder output for decoder-encoder cross attention. - If it is for decoder self attention, values for `k` and `v` would - be updated by new tensors concatanating raw tensors with intermediate - results of current step. It is only used for inference and should - be None for training. Default None - - Returns: - Variable: The output of multi-head attention. It is a tensor \ - that has the same shape and data type as `queries`. 
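A shape-only numpy sketch of the two cache forms described above (illustrative, not this class's code): the decoder self-attention cache grows by one position per step along the time axis, while the `static_k`/`static_v` pair is computed once from the encoder output and then reused unchanged.

.. code-block:: python

    import numpy as np

    batch_size, n_head, d_key = 2, 2, 64
    cache = {"k": np.zeros((batch_size, n_head, 0, d_key)),
             "v": np.zeros((batch_size, n_head, 0, d_key))}
    for step in range(3):
        new_k = np.random.rand(batch_size, n_head, 1, d_key)  # projection of the current token
        new_v = np.random.rand(batch_size, n_head, 1, d_key)
        cache["k"] = np.concatenate([cache["k"], new_k], axis=2)
        cache["v"] = np.concatenate([cache["v"], new_v], axis=2)
    # after 3 decoding steps the cached keys/values cover 3 target positions
    assert cache["k"].shape == (batch_size, n_head, 3, d_key)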
- """ - # compute q ,k ,v - q, k, v = self._prepare_qkv(queries, keys, values, cache) - - # scale dot product attention - product = layers.matmul( - x=q, y=k, transpose_y=True, alpha=self.d_key**-0.5) - if attn_bias is not None: - product += attn_bias - weights = layers.softmax(product) - if self.dropout_rate: - weights = layers.dropout( - weights, dropout_prob=self.dropout_rate, is_test=False) - - out = layers.matmul(weights, v) - - # combine heads - out = layers.transpose(out, perm=[0, 2, 1, 3]) - out = layers.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]]) - - # project to output - out = self.proj_fc(out) - return out - - def cal_kv(self, keys, values): - """ - Applies linear projection on input keys and values, then splits heads - (reshape and transpose) to get keys and values from different representation - subspaces for usage of subsequnt multiple attention in parallel. - - Parameters: - keys (Variable, optional): The keys for multi-head attention. It is - a tensor with shape `[batch_size, sequence_length, d_model]`. The - data type should be float32 or float64. - values (Variable, optional): The values for multi-head attention. It - is a tensor with shape `[batch_size, sequence_length, d_model]`. - The data type should be float32 or float64. - - Returns: - tuple: A tuple including linear projected keys and values. These two \ - tensors have shapes `[batch_size, n_head, sequence_length, d_key]` \ - and `[batch_size, n_head, sequence_length, d_value]` separately, \ - and their data types are same as inputs. - """ - k = self.k_fc(keys) - v = self.v_fc(values) - k = layers.reshape(x=k, shape=[0, 0, self.n_head, self.d_key]) - k = layers.transpose(x=k, perm=[0, 2, 1, 3]) - v = layers.reshape(x=v, shape=[0, 0, self.n_head, self.d_value]) - v = layers.transpose(x=v, perm=[0, 2, 1, 3]) - return k, v - - -class FFN(Layer): - """ - A fully connected feed-forward network applied to each position separately - and identically. This consists of two linear transformations with a activation - and dropout in between. - - Parameters: - d_inner_hid (int): The hidden size in the feedforward network(FFN). - d_model (int): The expected feature size in the input and output. - dropout_rate (float, optional): The dropout probability used after - activition. Default 0.1 - ffn_fc1_act (str, optional): The activation function in the feedforward - network. Default relu. - - Examples: - - .. code-block:: python - - import paddle - import paddle.fluid as fluid - from paddle.text import FFN - - # input: [batch_size, sequence_length, d_model] - x = paddle.rand((2, 4, 32)) - ffn = FFN(128, 32) - out = ffn(x) # [2, 4, 32] - """ - - def __init__(self, d_inner_hid, d_model, dropout_rate=0.1, fc1_act="relu"): - super(FFN, self).__init__() - self.dropout_rate = dropout_rate - self.fc1 = Linear( - input_dim=d_model, output_dim=d_inner_hid, act=fc1_act) - self.fc2 = Linear(input_dim=d_inner_hid, output_dim=d_model) - - def forward(self, x): - """ - Applies a fully connected feed-forward network on each position of the - input sequences separately and identically. - - Parameters: - x (Variable): The input of feed-forward network. It is a tensor - with shape `[batch_size, sequence_length, d_model]`. The data - type should be float32 or float64. - - Returns: - Variable: The output of feed-forward network. It is a tensor that has \ - the same shape and data type as `enc_input`. 
- """ - hidden = self.fc1(x) - if self.dropout_rate: - hidden = layers.dropout( - hidden, dropout_prob=self.dropout_rate, is_test=False) - out = self.fc2(hidden) - return out - - -class TransformerEncoderLayer(Layer): - """ - TransformerEncoderLayer is composed of two sub-layers which are self (multi-head) - attention and feedforward network. Before and after each sub-layer, pre-process - and post-precess would be applied on the input and output. - - Parameters: - n_head (int): The number of heads in multi-head attention(MHA). - d_key (int): The feature size to transformer queries and keys as in - multi-head attention. Mostly it equals to `d_model // n_head`. - d_value (int): The feature size to transformer values as in multi-head - attention. Mostly it equals to `d_model // n_head`. - d_model (int): The expected feature size in the input and output. - d_inner_hid (int): The hidden layer size in the feedforward network(FFN). - prepostprocess_dropout (float, optional): The dropout probability used - in pre-process and post-precess of MHA and FFN sub-layer. Default 0.1 - attention_dropout (float, optional): The dropout probability used - in MHA to drop some attention target. Default 0.1 - relu_dropout (float, optional): The dropout probability used after FFN - activition. Default 0.1 - preprocess_cmd (str, optional): The process applied before each MHA and - FFN sub-layer, and it also would be applied on output of the last - stacked layer. It should be a string composed of `d`, `a`, `n`, - where `d` for dropout, `a` for add residual connection, `n` for - layer normalization. Default `n`. - postprocess_cmd (str, optional): The process applied after each MHA and - FFN sub-layer. Same as `preprocess_cmd`. It should be a string - composed of `d`, `a`, `n`, where `d` for dropout, `a` for add - residual connection, `n` for layer normalization. Default `da`. - ffn_fc1_act (str, optional): The activation function in the feedforward - network. Default relu. - - Examples: - - .. code-block:: python - - import paddle - import paddle.fluid as fluid - from paddle.text import TransformerEncoderLayer - - # encoder input: [batch_size, src_len, d_model] - enc_input = paddle.rand((2, 4, 128)) - # self attention bias: [batch_size, n_head, src_len, src_len] - attn_bias = paddle.rand((2, 2, 4, 4)) - encoder_layer = TransformerEncoderLayer(2, 64, 64, 128, 512) - enc_output = encoder_layer(enc_input, attn_bias) # [2, 4, 128] - """ - - def __init__(self, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout=0.1, - attention_dropout=0.1, - relu_dropout=0.1, - preprocess_cmd="n", - postprocess_cmd="da", - ffn_fc1_act="relu"): - - super(TransformerEncoderLayer, self).__init__() - - self.preprocesser1 = PrePostProcessLayer(preprocess_cmd, d_model, - prepostprocess_dropout) - self.self_attn = MultiHeadAttention(d_key, d_value, d_model, n_head, - attention_dropout) - self.postprocesser1 = PrePostProcessLayer(postprocess_cmd, d_model, - prepostprocess_dropout) - - self.preprocesser2 = PrePostProcessLayer(preprocess_cmd, d_model, - prepostprocess_dropout) - self.ffn = FFN(d_inner_hid, d_model, relu_dropout, fc1_act=ffn_fc1_act) - self.postprocesser2 = PrePostProcessLayer(postprocess_cmd, d_model, - prepostprocess_dropout) - - def forward(self, enc_input, attn_bias=None): - """ - Applies a Transformer encoder layer on the input. - - Parameters: - enc_input (Variable): The input of Transformer encoder layer. It is - a tensor with shape `[batch_size, sequence_length, d_model]`. 
- The data type should be float32 or float64. - attn_bias(Variable, optional): A tensor used in encoder self attention - to mask out attention on unwanted positions, usually the paddings. It - is a tensor with shape `[batch_size, n_head, sequence_length, sequence_length]`, - where the unwanted positions have `-INF` values and the others - have 0 values. The data type should be float32 or float64. It can - be None when nothing wanted or needed to be masked out. Default None - - Returns: - Variable: The output of Transformer encoder layer. It is a tensor that \ - has the same shape and data type as `enc_input`. - """ - attn_output = self.self_attn( - self.preprocesser1(enc_input), None, None, attn_bias) - attn_output = self.postprocesser1(attn_output, enc_input) - - ffn_output = self.ffn(self.preprocesser2(attn_output)) - ffn_output = self.postprocesser2(ffn_output, attn_output) - return ffn_output - - -class TransformerEncoder(Layer): - """ - TransformerEncoder is a stack of N encoder layers. - - Parameters: - n_layer (int): The number of encoder layers to be stacked. - n_head (int): The number of heads in multi-head attention(MHA). - d_key (int): The feature size to transformer queries and keys as in - multi-head attention. Mostly it equals to `d_model // n_head`. - d_value (int): The feature size to transformer values as in multi-head - attention. Mostly it equals to `d_model // n_head`. - d_model (int): The expected feature size in the input and output. - d_inner_hid (int): The hidden layer size in the feedforward network(FFN). - prepostprocess_dropout (float, optional): The dropout probability used - in pre-process and post-precess of MHA and FFN sub-layer. Default 0.1 - attention_dropout (float, optional): The dropout probability used - in MHA to drop some attention target. Default 0.1 - relu_dropout (float, optional): The dropout probability used after FFN - activition. Default 0.1 - preprocess_cmd (str, optional): The process applied before each MHA and - FFN sub-layer, and it also would be applied on output of the last - stacked layer. It should be a string composed of `d`, `a`, `n`, - where `d` for dropout, `a` for add residual connection, `n` for - layer normalization. Default `n`. - postprocess_cmd (str, optional): The process applied after each MHA and - FFN sub-layer. Same as `preprocess_cmd`. It should be a string - composed of `d`, `a`, `n`, where `d` for dropout, `a` for add - residual connection, `n` for layer normalization. Default `da`. - ffn_fc1_act (str, optional): The activation function in the feedforward - network. Default relu. - - Examples: - - .. 
code-block:: python - - import paddle - import paddle.fluid as fluid - from paddle.text import TransformerEncoder - - # encoder input: [batch_size, src_len, d_model] - enc_input = paddle.rand((2, 4, 128)) - # self attention bias: [batch_size, n_head, src_len, src_len] - attn_bias = paddle.rand((2, 2, 4, 4)) - encoder = TransformerEncoder(2, 2, 64, 64, 128, 512) - enc_output = encoder(enc_input, attn_bias) # [2, 4, 128] - """ - - def __init__(self, - n_layer, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout=0.1, - attention_dropout=0.1, - relu_dropout=0.1, - preprocess_cmd="n", - postprocess_cmd="da", - ffn_fc1_act="relu"): - - super(TransformerEncoder, self).__init__() - - self.encoder_layers = list() - for i in range(n_layer): - self.encoder_layers.append( - self.add_sublayer( - "layer_%d" % i, - TransformerEncoderLayer( - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - preprocess_cmd, - postprocess_cmd, - ffn_fc1_act=ffn_fc1_act))) - self.processer = PrePostProcessLayer(preprocess_cmd, d_model, - prepostprocess_dropout) - - def forward(self, enc_input, attn_bias=None): - """ - Applies a stack of N Transformer encoder layers on input sequences. - - Parameters: - enc_input (Variable): The input of Transformer encoder. It is a tensor - with shape `[batch_size, sequence_length, d_model]`. The data - type should be float32 or float64. - attn_bias(Variable, optional): A tensor used in encoder self attention - to mask out attention on unwanted positions, usually the paddings. It - is a tensor with shape `[batch_size, n_head, sequence_length, sequence_length]`, - where the unwanted positions have `-INF` values and the others - have 0 values. The data type should be float32 or float64. It can - be None when nothing wanted or needed to be masked out. Default None - - Returns: - Variable: The output of Transformer encoder. It is a tensor that has \ - the same shape and data type as `enc_input`. - """ - for encoder_layer in self.encoder_layers: - enc_output = encoder_layer(enc_input, attn_bias) - enc_input = enc_output - - return self.processer(enc_output) - - -class TransformerDecoderLayer(Layer): - """ - TransformerDecoderLayer is composed of three sub-layers which are decoder - self (multi-head) attention, decoder-encoder cross attention and feedforward - network. Before and after each sub-layer, pre-process and post-precess would - be applied on the input and output. - - Parameters: - n_head (int): The number of heads in multi-head attention(MHA). - d_key (int): The feature size to transformer queries and keys as in - multi-head attention. Mostly it equals to `d_model // n_head`. - d_value (int): The feature size to transformer values as in multi-head - attention. Mostly it equals to `d_model // n_head`. - d_model (int): The expected feature size in the input and output. - d_inner_hid (int): The hidden layer size in the feedforward network(FFN). - prepostprocess_dropout (float, optional): The dropout probability used - in pre-process and post-precess of MHA and FFN sub-layer. Default 0.1 - attention_dropout (float, optional): The dropout probability used - in MHA to drop some attention target. Default 0.1 - relu_dropout (float, optional): The dropout probability used after FFN - activition. Default 0.1 - preprocess_cmd (str, optional): The process applied before each MHA and - FFN sub-layer, and it also would be applied on output of the last - stacked layer. 
It should be a string composed of `d`, `a`, `n`, - where `d` for dropout, `a` for add residual connection, `n` for - layer normalization. Default `n`. - postprocess_cmd (str, optional): The process applied after each MHA and - FFN sub-layer. Same as `preprocess_cmd`. It should be a string - composed of `d`, `a`, `n`, where `d` for dropout, `a` for add - residual connection, `n` for layer normalization. Default `da`. - ffn_fc1_act (str, optional): The activation function in the feedforward - network. Default relu. - - Examples: - - .. code-block:: python - - import paddle - import paddle.fluid as fluid - from paddle.text import TransformerDecoderLayer - - # decoder input: [batch_size, trg_len, d_model] - dec_input = paddle.rand((2, 4, 128)) - # encoder output: [batch_size, src_len, d_model] - enc_output = paddle.rand((2, 6, 128)) - # self attention bias: [batch_size, n_head, trg_len, trg_len] - self_attn_bias = paddle.rand((2, 2, 4, 4)) - # cross attention bias: [batch_size, n_head, trg_len, src_len] - cross_attn_bias = paddle.rand((2, 2, 4, 6)) - decoder_layer = TransformerDecoderLayer(2, 64, 64, 128, 512) - output = decoder_layer(dec_input, - enc_output, - self_attn_bias, - cross_attn_bias) # [2, 4, 128] - """ - - def __init__(self, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout=0.1, - attention_dropout=0.1, - relu_dropout=0.1, - preprocess_cmd="n", - postprocess_cmd="da", - ffn_fc1_act="relu"): - super(TransformerDecoderLayer, self).__init__() - - self.preprocesser1 = PrePostProcessLayer(preprocess_cmd, d_model, - prepostprocess_dropout) - self.self_attn = MultiHeadAttention(d_key, d_value, d_model, n_head, - attention_dropout) - self.postprocesser1 = PrePostProcessLayer(postprocess_cmd, d_model, - prepostprocess_dropout) - - self.preprocesser2 = PrePostProcessLayer(preprocess_cmd, d_model, - prepostprocess_dropout) - self.cross_attn = MultiHeadAttention(d_key, d_value, d_model, n_head, - attention_dropout) - self.postprocesser2 = PrePostProcessLayer(postprocess_cmd, d_model, - prepostprocess_dropout) - - self.preprocesser3 = PrePostProcessLayer(preprocess_cmd, d_model, - prepostprocess_dropout) - self.ffn = FFN(d_inner_hid, d_model, relu_dropout, fc1_act=ffn_fc1_act) - self.postprocesser3 = PrePostProcessLayer(postprocess_cmd, d_model, - prepostprocess_dropout) - - def forward(self, - dec_input, - enc_output, - self_attn_bias=None, - cross_attn_bias=None, - cache=None): - """ - Applies a Transformer decoder layer on the input. - - Parameters: - dec_input (Variable): The input of Transformer decoder. It is a tensor - with shape `[batch_size, target_length, d_model]`. The data type - should be float32 or float64. - enc_output (Variable): The output of Transformer encoder. It is a tensor - with shape `[batch_size, source_length, d_model]`. The data type - should be float32 or float64. - self_attn_bias (Variable, optional): A tensor used in decoder self attention - to mask out attention on unwanted positions, usually the subsequent positions. - It is a tensor with shape `[batch_size, n_head, target_length, target_length]`, - where the unwanted positions have `-INF` values and the others - have 0 values. The data type should be float32 or float64. It can - be None when nothing wanted or needed to be masked out. Default None - cross_attn_bias (Variable, optional): A tensor used in decoder-encoder cross - attention to mask out attention on unwanted positions, usually the paddings. 
- It is a tensor with shape `[batch_size, n_head, target_length, target_length]`, - where the unwanted positions have `-INF` values and the others - have 0 values. The data type should be float32 or float64. It can - be None when nothing wanted or needed to be masked out. Default None - caches(dict, optional): It caches the multi-head attention intermediate - results of history decoding steps and encoder output. It is a dict - has `k`, `v`, `static_k`, `statkc_v` as keys and values are cached - results. It is only used for inference and should be None for - training. Default None - - Returns: - Variable: The output of Transformer decoder layer. It is a tensor \ - that has the same shape and data type as `dec_input`. - """ - self_attn_output = self.self_attn( - self.preprocesser1(dec_input), None, None, self_attn_bias, cache) - self_attn_output = self.postprocesser1(self_attn_output, dec_input) - - cross_attn_output = self.cross_attn( - self.preprocesser2(self_attn_output), enc_output, enc_output, - cross_attn_bias, cache) - cross_attn_output = self.postprocesser2(cross_attn_output, - self_attn_output) - - ffn_output = self.ffn(self.preprocesser3(cross_attn_output)) - ffn_output = self.postprocesser3(ffn_output, cross_attn_output) - - return ffn_output - - -class TransformerDecoder(Layer): - """ - TransformerDecoder is a stack of N decoder layers. - - Parameters: - n_layer (int): The number of encoder layers to be stacked. - n_head (int): The number of heads in multi-head attention(MHA). - d_key (int): The feature size to transformer queries and keys as in - multi-head attention. Mostly it equals to `d_model // n_head`. - d_value (int): The feature size to transformer values as in multi-head - attention. Mostly it equals to `d_model // n_head`. - d_model (int): The expected feature size in the input and output. - d_inner_hid (int): The hidden layer size in the feedforward network(FFN). - prepostprocess_dropout (float, optional): The dropout probability used - in pre-process and post-precess of MHA and FFN sub-layer. Default 0.1 - attention_dropout (float, optional): The dropout probability used - in MHA to drop some attention target. Default 0.1 - relu_dropout (float, optional): The dropout probability used after FFN - activition. Default 0.1 - preprocess_cmd (str, optional): The process applied before each MHA and - FFN sub-layer, and it also would be applied on output of the last - stacked layer. It should be a string composed of `d`, `a`, `n`, - where `d` for dropout, `a` for add residual connection, `n` for - layer normalization. Default `n`. - postprocess_cmd (str, optional): The process applied after each MHA and - FFN sub-layer. Same as `preprocess_cmd`. It should be a string - composed of `d`, `a`, `n`, where `d` for dropout, `a` for add - residual connection, `n` for layer normalization. Default `da`. - ffn_fc1_act (str, optional): The activation function in the feedforward - network. Default relu. - - Examples: - - .. 
code-block:: python - - import paddle - import paddle.fluid as fluid - from paddle.text import TransformerDecoder - - # decoder input: [batch_size, trg_len, d_model] - dec_input = paddle.rand((2, 4, 128)) - # encoder output: [batch_size, src_len, d_model] - enc_output = paddle.rand((2, 6, 128)) - # self attention bias: [batch_size, n_head, trg_len, trg_len] - self_attn_bias = paddle.rand((2, 2, 4, 4)) - # cross attention bias: [batch_size, n_head, trg_len, src_len] - cross_attn_bias = paddle.rand((2, 2, 4, 6)) - decoder = TransformerDecoder(2, 2, 64, 64, 128, 512) - dec_output = decoder(dec_input, - enc_output, - self_attn_bias, - cross_attn_bias) # [2, 4, 128] - """ - - def __init__(self, - n_layer, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout=0.1, - attention_dropout=0.1, - relu_dropout=0.1, - preprocess_cmd="n", - postprocess_cmd="da", - ffn_fc1_act="relu"): - super(TransformerDecoder, self).__init__() - - self.n_layer = n_layer - self.n_head = n_head - self.d_key = d_key - self.d_value = d_value - - self.decoder_layers = list() - for i in range(n_layer): - self.decoder_layers.append( - self.add_sublayer( - "layer_%d" % i, - TransformerDecoderLayer(n_head, d_key, d_value, d_model, - d_inner_hid, prepostprocess_dropout, - attention_dropout, relu_dropout, - preprocess_cmd, postprocess_cmd))) - self.processer = PrePostProcessLayer(preprocess_cmd, d_model, - prepostprocess_dropout) - - def forward(self, - dec_input, - enc_output, - self_attn_bias=None, - cross_attn_bias=None, - caches=None): - """ - Applies a stack of N Transformer decoder layers on inputs. - - Parameters: - dec_input (Variable): The input of Transformer decoder. It is a tensor - with shape `[batch_size, target_length, d_model]`. The data type - should be float32 or float64. - enc_output (Variable): The output of Transformer encoder. It is a tensor - with shape `[batch_size, source_length, d_model]`. The data type - should be float32 or float64. - self_attn_bias (Variable, optional): A tensor used in decoder self attention - to mask out attention on unwanted positions, usually the subsequent positions. - It is a tensor with shape `[batch_size, n_head, target_length, target_length]`, - where the unwanted positions have `-INF` values and the others - have 0 values. The data type should be float32 or float64. It can - be None when nothing wanted or needed to be masked out. Default None - cross_attn_bias (Variable, optional): A tensor used in decoder-encoder cross - attention to mask out attention on unwanted positions, usually the paddings. - It is a tensor with shape `[batch_size, n_head, target_length, target_length]`, - where the unwanted positions have `-INF` values and the others - have 0 values. The data type should be float32 or float64. It can - be None when nothing wanted or needed to be masked out. Default None - caches(list, optional): It caches the multi-head attention intermediate results - of history decoding steps and encoder output. It is a list of dict - where the length of list is decoder layer number, and each dict - has `k`, `v`, `static_k`, `statkc_v` as keys and values are cached - results. It is only used for inference and should be None for - training. Default None - - Returns: - Variable: The output of Transformer decoder. It is a tensor that has \ - the same shape and data type as `dec_input`. 
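A sketch of the intended per-step inference call pattern with `caches` (following the cache handling in `TransformerCell` above and the helper methods below; sizes are illustrative and it assumes the same dygraph setup as the other examples):

.. code-block:: python

    import paddle
    import paddle.fluid as fluid
    from paddle.text import TransformerDecoder

    decoder = TransformerDecoder(2, 2, 64, 64, 128, 512)
    enc_output = paddle.rand((2, 6, 128))

    # per-layer caches: empty k/v for incremental self-attention, plus
    # precomputed static_k/static_v for decoder-encoder cross attention
    caches = decoder.prepare_incremental_cache(enc_output)
    for cache, static_cache in zip(caches, decoder.prepare_static_cache(enc_output)):
        cache.update(static_cache)

    for step in range(3):
        dec_input = paddle.rand((2, 1, 128))  # one target position per step
        dec_output = decoder(dec_input, enc_output, None, None, caches)  # [2, 1, 128]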
- """ - for i, decoder_layer in enumerate(self.decoder_layers): - dec_output = decoder_layer(dec_input, enc_output, self_attn_bias, - cross_attn_bias, caches[i] - if caches else None) - dec_input = dec_output - - return self.processer(dec_output) - - def prepare_static_cache(self, enc_output): - """ - Generate a list of dict where the length of list is decoder layer number. - Each dict has `static_k`, `statkc_v` as keys, and values are projected - results of encoder output to be used as keys and values in decoder-encoder - cross (multi-head) attention. Used in inference. - - Parameters: - enc_output (Variable): The output of Transformer encoder. It is a tensor - with shape `[batch_size, source_length, d_model]`. The data type - should be float32 or float64. - - Returns: - list: A list of dict. Each dict has `static_k`, `statkc_v` as keys, \ - and values are projected results of encoder output to be used as \ - keys and values in decoder-encoder cross (multi-head) attention. - """ - return [ - dict( - zip(("static_k", "static_v"), - decoder_layer.cross_attn.cal_kv(enc_output, enc_output))) - for decoder_layer in self.decoder_layers - ] - - def prepare_incremental_cache(self, enc_output): - """ - Generate a list of dict where the length of list is decoder layer number. - Each dict has `k`, `v` as keys, and values are empty tensors with shape - `[batch_size, n_head, 0, d_key]` and `[batch_size, n_head, 0, d_value]`, - representing the decoder self (multi-head) attention intermediate results, - and 0 is the initial length which would increase as inference decoding - continued. Used in inference. - - Parameters: - enc_output (Variable): The output of Transformer encoder. It is a tensor - with shape `[batch_size, source_length, d_model]`. The data type - should be float32 or float64. Actually, it is used to provide batch - size for Transformer initial states(caches), thus any tensor has - wanted batch size can be used here. - - Returns: - list: A list of dict. Each dict has `k`, `v` as keys, and values are \ - empty tensors representing intermediate results of history decoding \ - steps in decoder self (multi-head) attention at time step 0. - """ - return [{ - "k": layers.fill_constant_batch_size_like( - input=enc_output, - shape=[-1, self.n_head, 0, self.d_key], - dtype=enc_output.dtype, - value=0), - "v": layers.fill_constant_batch_size_like( - input=enc_output, - shape=[-1, self.n_head, 0, self.d_value], - dtype=enc_output.dtype, - value=0), - } for i in range(self.n_layer)] - - -class LinearChainCRF(Layer): - """ - Computes the negtive log-likelihood of tag sequences in a linear chain CRF. - Using terminologies of undirected probabilistic graph model, it calculates - probability using unary potentials (for emission) and binary potentials - (for transition). - - This layer creates a learnable parameter shaped `[size + 2, size]` (`size` - is for the number of tags), where: - - 1. the first row is for starting weights, denoted as $a$ here - - 2. the second row is for ending weights, denoted as $b$ here. - - 3. the remaining rows is a matrix for transition weights. - - Denote input tensor of unary potentials(emission) as $x$ , then the probability - of a tag sequence $s$ of length $L$ is defined as: - - $$P(s) = (1/Z) \exp(a_{s_1} + b_{s_L} - + \sum_{l=1}^L x_{s_l} - + \sum_{l=2}^L w_{s_{l-1},s_l})$$ - - where $Z$ is a normalization value so that the sum of $P(s)$ over - all possible sequences is 1, and $x$ is the emission feature weight - to the linear chain CRF. 
-
-    This operator implements the Forward-Backward algorithm for the linear chain
-    CRF. Please refer to http://www.cs.columbia.edu/~mcollins/fb.pdf and
-    http://cseweb.ucsd.edu/~elkan/250Bwinter2012/loglinearCRFs.pdf for details.
-
-    NOTE:
-
-    1. The feature function for a CRF is made up of the emission features and the
-    transition features. The emission feature weights are NOT computed in
-    this operator. They MUST be computed first before this operator is called.
-
-    2. Because this operator performs global normalization over all possible
-    sequences internally, it expects UNSCALED emission feature weights.
-    Please do not call this op with the emission feature being output of any
-    nonlinear activation.
-
-    3. The 2nd dimension of input (emission) MUST be equal to the tag number.
-
-    Parameters:
-        size (int): The number of tags.
-        param_attr (ParamAttr, optional): The attribute of the learnable parameter for
-            transition. Default: None
-        dtype (str, optional): Data type, it can be 'float32' or 'float64'.
-            Default: `float32`
-
-    Examples:
-
-        .. code-block:: python
-
-            import numpy as np
-            import paddle
-            import paddle.fluid as fluid
-            from paddle.text import LinearChainCRF
-
-            # emission: [batch_size, sequence_length, num_tags]
-            emission = paddle.rand((2, 8, 5))
-            # label: [batch_size, sequence_length]
-            # dummy label just for example usage
-            label = paddle.ones((2, 8), dtype='int64')
-            length = fluid.layers.assign(np.array([6, 8]).astype('int64'))
-            crf = LinearChainCRF(size=5)
-            cost = crf(emission, label, length)  # [2, 1]
-    """
-
-    def __init__(self, size, param_attr=None, dtype='float32'):
-        super(LinearChainCRF, self).__init__()
-        self._param_attr = param_attr
-        self._dtype = dtype
-        self._size = size
-        self._transition = self.create_parameter(
-            attr=self._param_attr,
-            shape=[self._size + 2, self._size],
-            dtype=self._dtype)
-
-    @property
-    def weight(self):
-        """
-        getter for transition matrix parameter
-
-        Returns:
-            Parameter: The learnable transition parameter shaped `[size + 2, size]` \
-                (`size` is for the number of tags). The data type should be float32 \
-                or float64.
-        """
-        return self._transition
-
-    @weight.setter
-    def weight(self, value):
-        """
-        setter for transition matrix parameter
-
-        Parameters:
-            value (Parameter): The learnable transition parameter shaped `[size + 2, size]` \
-                (`size` is for the number of tags). The data type should be float32 \
-                or float64.
-        """
-        self._transition = value
-
-    def forward(self, input, label, length):
-        """
-        Computes the negative log-likelihood of tag sequences in a linear chain CRF.
-
-        Parameters:
-            input (Variable): The input of unary potentials (emission). It is a
-                tensor with shape `[batch_size, sequence_length, num_tags]`.
-                The data type should be float32 or float64.
-            label (Variable): The golden sequence tags. It is a tensor
-                with shape `[batch_size, sequence_length]`. The data type
-                should be int64.
-            length (Variable): A tensor with shape `[batch_size]`. It stores real
-                length of each sequence for correctness.
-
-        Returns:
-            Variable: The negative log-likelihood of tag sequences. It is a tensor \
-                with shape `[batch_size, 1]` and has float32 or float64 data type.
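Since decoding should reuse the transitions learned by this loss layer, the `weight` property is typically used to tie the two layers' parameters, as `SequenceTagging.forward` further below does; a minimal sketch, assuming the `paddle.text` classes in this hunk:

.. code-block:: python

    from paddle.text import LinearChainCRF, CRFDecoding

    crf = LinearChainCRF(size=5)
    crf_decoding = CRFDecoding(size=5)

    # Both layers hold a [size + 2, size] transition parameter.
    print(crf.weight.shape)           # [7, 5]
    crf_decoding.weight = crf.weight  # decode with the learned transitions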
- """ - alpha = self._helper.create_variable_for_type_inference( - dtype=self._dtype) - emission_exps = self._helper.create_variable_for_type_inference( - dtype=self._dtype) - transition_exps = self._helper.create_variable_for_type_inference( - dtype=self._dtype) - log_likelihood = self._helper.create_variable_for_type_inference( - dtype=self._dtype) - this_inputs = { - "Emission": [input], - "Transition": self._transition, - "Label": [label] - } - if length is not None: - this_inputs['Length'] = [length] - self._helper.append_op( - type='linear_chain_crf', - inputs=this_inputs, - outputs={ - "Alpha": [alpha], - "EmissionExps": [emission_exps], - "TransitionExps": transition_exps, - "LogLikelihood": log_likelihood - }) - return log_likelihood - - -class CRFDecoding(Layer): - """ - CRFDecoding reads the emission feature weights and the transition - feature weights learned by the `LinearChainCRF` and performs decoding. - It implements the Viterbi algorithm which is a dynamic programming algorithm - for finding the most likely sequence of hidden states, called the Viterbi path, - that results in a sequence of observed tags. - - The output of this layer changes according to whether `label` is given: - - 1. `label` is given: - - This happens in training. This operator is used to co-work with the chunk_eval - operator. When `label` is given, it returns tensor with the same shape as - `label` whose values are fixed to be 0, indicating an incorrect prediction, - or 1 indicating a tag is correctly predicted. Such an output is the input to - chunk_eval operator. - - 2. `label` is not given: - - This is the standard decoding process and get the highest scoring sequence - of tags. - - Parameters: - size (int): The number of tags. - param_attr (ParamAttr, optional): The attribute of the learnable parameter for - transition. Default: None - dtype (str, optional): Data type, it can be 'float32' or 'float64'. - Default: `float32` - - Examples: - - .. code-block:: python - - import numpy as np - import paddle - import paddle.fluid as fluid - from paddle.text import CRFDecoding - - # emission: [batch_size, sequence_length, num_tags] - emission = paddle.rand((2, 8, 5)) - length = fluid.layers.assign(np.array([6, 8]).astype('int64')) - crf_decoding = CRFDecoding(size=5) - cost = crf_decoding(emission, length) # [2, 8] - """ - - def __init__(self, size, param_attr=None, dtype='float32'): - super(CRFDecoding, self).__init__() - self._dtype = dtype - self._size = size - self._param_attr = param_attr - self._transition = self.create_parameter( - attr=self._param_attr, - shape=[self._size + 2, self._size], - dtype=self._dtype) - - @property - def weight(self): - """ - getter for transition matrix parameter - - Returns: - Parameter: The learnable transition parameter shaped `[size + 2, size]` \ - (`size` is for the number of tags). The data type should be float32 \ - or float64. - """ - return self._transition - - @weight.setter - def weight(self, value): - """ - setter for transition matrix parameter - - Parameters: - value (Parameter): The learnable transition parameter shaped `[size + 2, size]` \ - (`size` is for the number of tags). The data type should be float32 \ - or float64. - """ - self._transition = value - - def forward(self, input, length, label=None): - """ - Performs sequence tagging prediction. - - Parameters: - input (Variable): The input of unary potentials(emission). It is a - tensor with shape `[batch_size, sequence_length, num_tags]`. - The data type should be float32 or float64. 
-            length (Variable): A tensor with shape `[batch_size]`.
-                It stores real length of each sequence for correctness.
-            label (Variable, optional): The golden sequence tags. It is a tensor
-                with shape `[batch_size, sequence_length]`. The data type
-                should be int64. Default None.
-
-        Returns:
-            Variable: A tensor with shape `[batch_size, sequence_length]` and \
-                int64 data type. If `label` is None, its values range from 0 to \
-                maximum tag number - 1, and each element indicates the index of a \
-                predicted tag. Otherwise the tensor has binary values indicating \
-                whether each tag is predicted correctly (1) or not (0).
-        """
-
-        viterbi_path = self._helper.create_variable_for_type_inference(
-            dtype=self._dtype)
-        this_inputs = {
-            "Emission": [input],
-            "Transition": self._transition,
-            "Label": label
-        }
-        if length is not None:
-            this_inputs['Length'] = [length]
-        self._helper.append_op(
-            type='crf_decoding',
-            inputs=this_inputs,
-            outputs={"ViterbiPath": [viterbi_path]})
-        return viterbi_path
-
-
-class _GRUEncoder(Layer):
-    """
-    A multi-layer bidirectional GRU encoder used by SequenceTagging.
-    """
-
-    def __init__(self,
-                 input_dim,
-                 grnn_hidden_dim,
-                 init_bound,
-                 num_layers=1,
-                 is_bidirection=False):
-        super(_GRUEncoder, self).__init__()
-        self.num_layers = num_layers
-        self.is_bidirection = is_bidirection
-        self.gru_list = []
-        self.gru_r_list = []
-        for i in range(num_layers):
-            self.basic_gru_cell = BasicGRUCell(
-                input_size=input_dim if i == 0 else input_dim * 2,
-                hidden_size=grnn_hidden_dim,
-                param_attr=fluid.ParamAttr(
-                    initializer=fluid.initializer.UniformInitializer(
-                        low=-init_bound, high=init_bound),
-                    regularizer=fluid.regularizer.L2DecayRegularizer(
-                        regularization_coeff=1e-4)))
-            self.gru_list.append(
-                self.add_sublayer(
-                    "gru_%d" % i,
-                    RNN(self.basic_gru_cell, is_reverse=False,
-                        time_major=False)))
-        if self.is_bidirection:
-            for i in range(num_layers):
-                self.basic_gru_cell_r = BasicGRUCell(
-                    input_size=input_dim if i == 0 else input_dim * 2,
-                    hidden_size=grnn_hidden_dim,
-                    param_attr=fluid.ParamAttr(
-                        initializer=fluid.initializer.UniformInitializer(
-                            low=-init_bound, high=init_bound),
-                        regularizer=fluid.regularizer.L2DecayRegularizer(
-                            regularization_coeff=1e-4)))
-                self.gru_r_list.append(
-                    self.add_sublayer(
-                        "gru_r_%d" % i,
-                        RNN(self.basic_gru_cell_r,
-                            is_reverse=True,
-                            time_major=False)))
-
-    def forward(self, input_feature, h0=None):
-        for i in range(self.num_layers):
-            pre_gru, pre_state = self.gru_list[i](input_feature)
-            if self.is_bidirection:
-                gru_r, r_state = self.gru_r_list[i](input_feature)
-                out = fluid.layers.concat(input=[pre_gru, gru_r], axis=-1)
-            else:
-                out = pre_gru
-            input_feature = out
-        return out
-
-
-class SequenceTagging(Layer):
-    """
-    Sequence tagging model using multi-layer bidirectional GRU as backbone and
-    linear chain CRF as output layer.
-
-    Parameters:
-        vocab_size (int): The size of vocabulary.
-        num_labels (int): The number of labels.
-        word_emb_dim (int, optional): The embedding size. Default 128
-        grnn_hidden_dim (int, optional): The hidden size of GRU. Default 128
-        emb_learning_rate (float, optional): The partial learning rate for embedding.
-            The actual learning rate for embedding is this value multiplied by the
-            global learning rate. Default 0.1
-        crf_learning_rate (float, optional): The partial learning rate for the CRF
-            layers. The actual learning rate for the CRF is this value multiplied by
-            the global learning rate. Default 0.1
-        bigru_num (int, optional): The number of bidirectional GRU layers.
-            Default 2
-        init_bound (float, optional): The range for the uniform initializer would
-            be `(-init_bound, init_bound)`. It would be used for all parameters
-            except the CRF transition matrix. Default 0.1
-
-    Examples:
-
-        .. code-block:: python
-
-            import numpy as np
-            import paddle
-            import paddle.fluid as fluid
-            from paddle.text import SequenceTagging
-
-            # word: [batch_size, sequence_length]
-            # dummy input just for example
-            word = paddle.ones((2, 8), dtype='int64')
-            length = fluid.layers.assign(np.array([6, 8]).astype('int64'))
-            seq_tagger = SequenceTagging(vocab_size=100, num_labels=5)
-            outputs = seq_tagger(word, length)
-    """
-
-    def __init__(self,
-                 vocab_size,
-                 num_labels,
-                 word_emb_dim=128,
-                 grnn_hidden_dim=128,
-                 emb_learning_rate=0.1,
-                 crf_learning_rate=0.1,
-                 bigru_num=2,
-                 init_bound=0.1):
-        super(SequenceTagging, self).__init__()
-        self.word_emb_dim = word_emb_dim
-        self.vocab_size = vocab_size
-        self.num_labels = num_labels
-        self.grnn_hidden_dim = grnn_hidden_dim
-        self.emb_lr = emb_learning_rate
-        self.crf_lr = crf_learning_rate
-        self.bigru_num = bigru_num
-        self.init_bound = init_bound
-
-        self.word_embedding = Embedding(
-            size=[self.vocab_size, self.word_emb_dim],
-            dtype='float32',
-            param_attr=fluid.ParamAttr(
-                learning_rate=self.emb_lr,
-                name="word_emb",
-                initializer=fluid.initializer.Uniform(
-                    low=-self.init_bound, high=self.init_bound)))
-
-        self.gru_encoder = _GRUEncoder(
-            input_dim=self.grnn_hidden_dim,
-            grnn_hidden_dim=self.grnn_hidden_dim,
-            init_bound=self.init_bound,
-            num_layers=self.bigru_num,
-            is_bidirection=True)
-
-        self.fc = Linear(
-            input_dim=self.grnn_hidden_dim * 2,
-            output_dim=self.num_labels,
-            param_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Uniform(
-                    low=-self.init_bound, high=self.init_bound),
-                regularizer=fluid.regularizer.L2DecayRegularizer(
-                    regularization_coeff=1e-4)))
-
-        self.linear_chain_crf = LinearChainCRF(
-            param_attr=fluid.ParamAttr(
-                name='linear_chain_crfw', learning_rate=self.crf_lr),
-            size=self.num_labels)
-
-        self.crf_decoding = CRFDecoding(
-            param_attr=fluid.ParamAttr(
-                name='crfw', learning_rate=self.crf_lr),
-            size=self.num_labels)
-
-    def forward(self, word, lengths, target=None):
-        """
-        Performs sequence tagging. If `target` is provided, it is for training and
-        the loss would also be returned; otherwise it is for inference and only the
-        predicted tags are returned.
-
-        Parameters:
-            word (Variable): The input sequences to be labeled. It is a tensor
-                with shape `[batch_size, sequence_length]`. The data type should
-                be int64.
-            lengths (Variable): A tensor with shape `[batch_size]`. It stores real
-                length of each sequence.
-            target (Variable, optional): The golden sequence tags. It is a tensor
-                with shape `[batch_size, sequence_length]`. The data type
-                should be int64. It could be None for inference. Default None.
-
-        Returns:
-            tuple: If `target` is provided, a tuple :code:`(crf_decode, avg_cost, lengths)` \
-                including the most likely sequence tags, the averaged CRF cost and the \
-                sequence lengths, whose shapes are `[batch_size, sequence_length]`, `[1]` \
-                and `[batch_size]` and data types are int64, float32 and int64. \
-                Otherwise, a tuple :code:`(crf_decode, lengths)` for inference.
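A minimal sketch of the two call paths described in the Returns section, with dummy data mirroring the class example above (the zero `target` tensor is only a placeholder for gold tags):

.. code-block:: python

    import numpy as np
    import paddle
    import paddle.fluid as fluid
    from paddle.text import SequenceTagging

    seq_tagger = SequenceTagging(vocab_size=100, num_labels=5)
    word = paddle.ones((2, 8), dtype='int64')
    length = fluid.layers.assign(np.array([6, 8]).astype('int64'))
    target = paddle.zeros((2, 8), dtype='int64')  # placeholder gold tags

    # Training: passing `target` also returns the averaged CRF cost.
    crf_decode, avg_cost, lengths = seq_tagger(word, length, target)

    # Inference: without `target`, only the decoded tags and lengths are returned.
    crf_decode, lengths = seq_tagger(word, length)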
- """ - word_embed = self.word_embedding(word) - input_feature = word_embed - - bigru_output = self.gru_encoder(input_feature) - emission = self.fc(bigru_output) - - if target is not None: - crf_cost = self.linear_chain_crf( - input=emission, label=target, length=lengths) - avg_cost = fluid.layers.mean(x=crf_cost) - self.crf_decoding.weight = self.linear_chain_crf.weight - crf_decode = self.crf_decoding(input=emission, length=lengths) - return crf_decode, avg_cost, lengths - else: - self.linear_chain_crf.weight = self.crf_decoding.weight - crf_decode = self.crf_decoding(input=emission, length=lengths) - return crf_decode, lengths