diff --git a/paddlenlp/transformers/xlnet/modeling.py b/paddlenlp/transformers/xlnet/modeling.py index 04fd17f3a502b0..752718230c2f7c 100644 --- a/paddlenlp/transformers/xlnet/modeling.py +++ b/paddlenlp/transformers/xlnet/modeling.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Modeling classes for XLNet model.""" +"""Modeling classes for XLNet model.""" import re import paddle @@ -22,8 +22,8 @@ from .. import PretrainedModel, register_base_model __all__ = [ - "XLNetModel", "XLNetPretrainedModel", + "XLNetModel", "XLNetForSequenceClassification", "XLNetForTokenClassification", ] @@ -468,10 +468,11 @@ def forward( class XLNetPretrainedModel(PretrainedModel): """ - An abstract class for pretrained XLNet models. It provides XLNet related ``model_config_file``, - ``resource_files_names``, ``pretrained_resource_files_map``, ``pretrained_init_configuration``, - ``base_model_prefix`` for downloading and loading pretrained models. - See :class:`~paddlenlp.transformers.model_utils.PretrainedModel` for more details. + An abstract class for pretrained XLNet models. It provides XLNet related `model_config_file`, + `resource_files_names`, `pretrained_resource_files_map`, `pretrained_init_configuration` and + `base_model_prefix` for downloading and loading pretrained models. + + Refer to :class:`~paddlenlp.transformers.model_utils.PretrainedModel` for more details. """ model_config_file = "model_config.json" @@ -642,64 +643,80 @@ def _init_weights(self, layer): @register_base_model class XLNetModel(XLNetPretrainedModel): """ - The bare XLNet Model transformer outputting raw hidden-states without any specific head on top. + The bare XLNet Model outputting raw hidden-states without any specific head on top. This model inherits from :class:`~paddlenlp.transformers.model_utils.PretrainedModel`. - Check the superclass documentation for the generic methods and the library implements for all its model. + Refer to the superclass documentation for the generic methods. - This model is also a Paddle `paddle.nn.Layer `__ subclass. Use it as a regular Paddle Layer and refer to the Paddle documentation for all matter related to general usage and behavior. Args: - vocab_size (`int`): - Vocabulary size of the XLNet model. Defines the number of different tokens that can - be represented by the `inputs_ids` passed when calling XLNetModel. - mem_len (`int` or `None`, optional): - The number of tokens to cache. The key/value pairs that have already been pre-computed in a previous - forward pass won't be re-computed. Defaults to ``None``. - reuse_len (`int` or `None`, optional): - The number of tokens in the current batch to be cached and reused in the future. Defaults to ``None``. - d_model (`int`, optional): - Dimensionality of the encoder layers and the pooler layer. Defaults to ``768``. - same_length (`bool`, optional): - Whether or not to use the same attention length for each token. Defaults to ``False``. - attn_type (`str`, optional): - The attention type used by the model. Set `"bi"` for XLNet, `"uni"` for Transformer-XL. - Defaults to ``"bi"``. - bi_data (`bool`, optional): - Whether or not to use bidirectional input pipeline. Usually set to `True` during pretraining and - `False` during fine-tuning. Defaults to ``False``. - clamp_len (`int`, optional): - Clamp all relative distances larger than clamp_len. Setting this attribute to -1 means no clamping. 
- Defaults to ``-1``. - n_layer (`int`, optional): - Number of hidden layers in the Transformer encoder. Defaults to ``12``. - dropout (`float`, optional): - The dropout probability for all fully connected layers in the embeddings and encoder. - Defaults to ``0.1``. - classifier_dropout (`float`, optional): - The dropout probability for all fully connected layers in the pooler. - Defaults to ``0.1``. - n_head (`int`, optional): - Number of attention heads for each attention layer in the Transformer encoder. - Defaults to ``12``. - d_head (`int`, optional): - Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. - Defaults to ``64``. - layer_norm_eps (`float`, optional): - The epsilon used by the layer normalization layers. - Defaults to ``1e-12``. - d_inner (`int`, optional): - Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. - Defaults to ``3072``. - ff_activation (`str`, optional): - The non-linear activation function in the feed-forward layer. - ``"gelu"``, ``"relu"``, ``"silu"`` and ``"gelu_new"`` are supported. - Defaults to ``"gelu"``. - initializer_range (`float`, optional): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - Defaults to ``0.02``. + vocab_size (int): + Vocabulary size of `inputs_ids` in `XLNetModel`. + Also is the vocab size of token embedding matrix. + mem_len (int or None, optional): + The number of tokens to cache. If not 0 or None, the last `mem_len` hidden states + in each layer will be cached into memory. Defaults to `None`. + reuse_len (int or None, optional): + The number of tokens in the current batch to be cached. If positive, then at most + `reuse_len` tokens can be cached in the current batch. Otherwise, there is + no limit to the number of tokens. Defaults to `None`. + + .. note:: + The difference between `mem_len` and `reuse_len` is that `mem_len` defines + **the total number** of tokens to cache while `reuse_len` defines the number of tokens + in **the current batch** to be cached. + d_model (int, optional): + Dimensionality of the embedding layers, encoder layers and pooler layer. + Defaults to 768. + same_length (bool, optional): + Whether or not to use the same attention length for each token. + Defaults to `False`. + attn_type (str, optional): + The attention type used in the attention layer. Set **"bi"** for ``XLNet``, + **"uni"** for ``Transformer-XL``. Defaults to **"bi"**. + bi_data (bool, optional): + Whether or not to use bidirectional input pipeline. Set to `True` during pretraining and + `False` during fine-tuning. Defaults to `False`. + clamp_len (int, optional): + Maximum relative distance supported. All relative distances larger than `clamp_len` will be clamped. + Setting this attribute to -1 means no clamping. Defaults to -1. + n_layer (int, optional): + The number of hidden layers in the encoder. Defaults to 12. + dropout (float, optional): + The dropout ratio for all fully connected layers in the embeddings and encoder. + Defaults to 0.1. + classifier_dropout (float, optional): + The dropout ratio for all fully connected layers in the pooler (classification head). + Defaults to 0.1. + n_head (int, optional): + Number of attention heads in each attention layer. + Defaults to 12. + d_head (int, optional): + Dimensionality of each attention head. Defaults to 64. + + .. note:: + `d_head` should be equal to `d_model` divided by `n_head`. 
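As a concrete illustration of the note above, the base configuration factors as d_model (768) = n_head (12) * d_head (64). A minimal construction sketch using only the keyword arguments documented above (the concrete `vocab_size` value is illustrative, not part of this diff):

.. code-block::

    from paddlenlp.transformers import XLNetModel

    # Base-sized configuration: d_model (768) == n_head (12) * d_head (64).
    model = XLNetModel(
        vocab_size=32000,  # illustrative value; use the tokenizer's real vocab size
        d_model=768,
        n_head=12,
        d_head=64,
        d_inner=3072,
        n_layer=12,
        ff_activation="gelu",
        dropout=0.1,
    )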
+ layer_norm_eps (float, optional): + The `epsilon` parameter used in :class:`paddle.nn.LayerNorm` for + initializing layer normalization layers. Defaults to 1e-12. + d_inner (int, optional): + Dimensionality of the feed-forward (ff) layer in the encoder. Input tensors + to ff layers are firstly projected from `d_model` to `d_inner`, + and then projected back to `d_model`. Typically `d_inner` is larger than `d_model`. + Defaults to 3072. + ff_activation (str, optional): + The non-linear activation function in the feed-forward layers in the encoder. + Choose from the following supported activation functions: `["relu", "gelu", "tanh", + "sigmoid", "mish", "swish"]`. Defaults to `"gelu"`. + initializer_range (float, optional): + The standard deviation of the normal initializer. Defaults to 0.02. + + .. note:: + A normal_initializer initializes weight matrices as normal distributions. + See :meth:`XLNetPretrainedModel._init_weights()` for how weights are initialized in `XLNetModel`. """ def __init__( @@ -857,112 +874,123 @@ def forward( output_hidden_states=False, return_dict=False, ): r""" - The XLNetModel forward method, overrides the __call__() special method. + The XLNetModel forward method, overrides the `__call__()` special method. Args: - input_ids (`Tensor`): - Indices of input sequence tokens in the vocabulary. - It's data type should be int64 and it has a shape of [batch_size, sequence_length]. - token_type_ids (`Tensor`, optional): + input_ids (Tensor): + Indices of input sequence tokens in the vocabulary. They are + numerical representations of tokens that build the input sequence. + It's data type should be `int64` and has a shape of [batch_size, sequence_length]. + token_type_ids (Tensor, optional): Segment token indices to indicate first and second portions of the inputs. - Indices can either be 0 or 1: - - - 0 corresponds to a *sentence A* token, - - 1 corresponds to a *sentence B* token. - - It's data type should be `int64` and it has a shape of [batch_size, sequence_length]. - Defaults to ``None``, which means we don't add segment embeddings. - attention_mask (`Tensor`, optional): - Mask to avoid performing attention on padding token indices with values being either 0 or 1: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - It's data type should be `float32` and it has a shape of [batch_size, sequence_length]. - Defaults to ``None``. - mems (`List[Tensor]`, optional): - Contains pre-computed hidden-states. Can be used to speed up sequential decoding. - It's a list (has a length of n_layers) of Tensors (has a data type of `float32`). - `use_mems` has to be set to `True` to make use of `mems`. - Defaults to ``None``, and we don't use mems. - perm_mask (`Tensor`, optional): - Mask to indicate the attention pattern for each input token with values being either 0 or 1. - - - if ``perm_mask[k, i, j] = 0``, i attend to j in batch k; - - if ``perm_mask[k, i, j] = 1``, i does not attend to j in batch k. - - Only used during pretraining (to define factorization order) or for sequential decoding (generation). - It's data type should be `float32` and it has a shape of [batch_size, sequence_length, sequence_length]. - Defaults to ``None``, and each token attends to all the others (full bidirectional attention). - target_mapping (`Tensor`, optional): + Indices can be either 0 or 1: + + - 0 corresponds to a **sentence A** token, + - 1 corresponds to a **sentence B** token. 
+ + It's data type should be `int64` and has a shape of [batch_size, sequence_length]. + Defaults to None, which means no segment embeddings are added to token embeddings. + attention_mask (Tensor, optional): + Mask to indicate whether to perform attention on each input token or not. + The values should be either 0 or 1. The attention scores will be set + to **-infinity** for any positions in the mask that are **0**, and will be + **unchanged** for positions that are **1**. + + - **1** for tokens that are **not masked**, + - **0** for tokens that are **masked**. + + It's data type should be `float32` and has a shape of [batch_size, sequence_length]. + Defaults to `None`. + mems (List[Tensor], optional): + A list of length `n_layers` with each Tensor being a pre-computed hidden-state for each layer. + Each Tensor has a dtype `float32` and a shape of [batch_size, sequence_length, hidden_size]. + Defaults to None, and we don't use mems. + + .. note:: + `use_mems` has to be set to `True` in order to make use of `mems`. + perm_mask (Tensor, optional): + Mask to indicate the permutation pattern of the input sequence with values being either 0 or 1. + + - if ``perm_mask[k, i, j] = 0``, i **attends** to j in batch k; + - if ``perm_mask[k, i, j] = 1``, i **does not attend** to j in batch k. + + Only used during pretraining (to define factorization order) or + for sequential decoding (generation). It's data type should be `float32` and + has a shape of [batch_size, sequence_length, sequence_length]. + Defaults to `None`, in which case each token attends to all the other tokens (full bidirectional attention). + target_mapping (Tensor, optional): Mask to indicate the output tokens to use with values being either 0 or 1. If ``target_mapping[k, i, j] = 1``, the i-th predict in batch k is on the j-th token. + It's data type should be `float32` and has a shape of [batch_size, num_predict, sequence_length]. Only used during pretraining for partial prediction or for sequential decoding (generation). - It's data type should be `float32` and it has a shape of [batch_size, num_predict, sequence_length]. - Defaults to ``None``. - input_mask (`Tensor`, optional): - Mask to avoid performing attention on padding token indices. - Negative of `attention_mask`, i.e. with 0 for real tokens and 1 for padding. - Mask values can either be 0 or 1: + Defaults to `None`. + input_mask (Tensor, optional): + Mask to avoid performing attention on padding tokens with values being either 0 or 1. + It's data type should be `float32` and it has a shape of [batch_size, sequence_length]. + This mask is the negative of `attention_mask`: - 1 for tokens that are **masked**, - 0 for tokens that are **not masked**. - You can only uses one of `input_mask` and `attention_mask`. - It's data type should be `float32` and it has a shape of [batch_size, sequence_length]. - Defaults to ``None``. - head_mask (`Tensor`, optional): - Mask to nullify selected heads of the self-attention modules. - Mask values can either be 0 or 1: + You should use only one of `input_mask` and `attention_mask`. Defaults to `None`. + head_mask (Tensor, optional): + Mask to nullify selected heads of the self-attention layers with values being either 0 or 1. - 1 indicates the head is **not masked**, - 0 indicates the head is **masked**. It's data type should be `float32` and has a shape of [num_heads] or [num_layers, num_heads]. - Defaults to ``None``, which means we keep all heads. + Defaults to `None`, which means we keep all heads.
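Since `attention_mask` and `input_mask` describe the same padding information with opposite conventions, only one of them should be passed to `forward()`. A small sketch of the relationship, with illustrative values:

.. code-block::

    import paddle

    # attention_mask convention: 1 marks real tokens, 0 marks padding.
    attention_mask = paddle.to_tensor([[1.0, 1.0, 1.0, 0.0, 0.0]])

    # input_mask is the complement: 1 marks padding, 0 marks real tokens.
    input_mask = 1.0 - attention_mask

    # Pass either attention_mask=... or input_mask=... to forward(), never both.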
+ inputs_embeds (Tensor, optional): An embedded representation tensor which is an alternative of `input_ids`. - You should only specify one of them to avoid contradiction. It's data type should be `float32` - and has a shape of [batch_size, sequence_length, hidden_size]. - Defaults to ``None``, which means we only specify `input_ids`. - use_mems_train (`bool`, optional): + You should specify only one of them to avoid contradiction. + It's data type should be `float32` and has a shape of [batch_size, sequence_length, hidden_size]. + Defaults to `None`, which means we only specify `input_ids`. + use_mems_train (bool, optional): Whether or not to use recurrent memory mechanism during training. - Defaults to ``False`` and we don't use recurrent memory mechanism in training mode. - use_mems_eval (`bool`, optional): + Defaults to `False` and we don't use recurrent memory mechanism in training mode. + use_mems_eval (bool, optional): Whether or not to use recurrent memory mechanism during evaluation. - Defaults to ``False`` and we don't use recurrent memory mechanism in evaluation mode. - output_attentions (`bool`, optional): + Defaults to `False` and we don't use recurrent memory mechanism in evaluation mode. + output_attentions (bool, optional): Whether or not to return the attentions tensors of all attention layers. - Defaults to ``False`` and we don't return the attentions tensors. - output_hidden_states (`bool`, optional): + Defaults to `False` and we don't return the attentions tensors. + output_hidden_states (bool, optional): Whether or not to return the hidden states of all layers. - Defaults to ``False`` and we don't return the hidden states. - return_dict (`bool`, optional): - Whether or not to format the output as a `dict`. - Defaults to ``False``, and the default output is a `tuple`. + Defaults to `False` and we don't return the hidden states. + return_dict (bool, optional): + Whether or not to format the output as a dict. + If True, the output will be formatted as a dict, otherwise it will be a plain tuple. + Defaults to False. Returns: - A `tuple` or a `dict`: A tuple of shape (``output``, ``new_mems``, ``hidden_states``, ``attentions``) - or a dict of shape {"last_hidden_state": ``output``, "mems": ``new_mems``, - "hidden_states": ``hidden_states``, "attentions": ``attentions``}. - - With the fields: - - - output (`Tensor`): - Sequence of hidden-states at the last layer of the model. - It's data type should be float32 and has a shape of [batch_size, num_predict, hidden_size]. - ``num_predict`` corresponds to ``target_mapping.shape[1]``. If ``target_mapping`` is ``None``, - then ``num_predict`` corresponds to ``sequence_length``. - - mems (`List[Tensor]`): - A Tensor list of length 'n_layers' containing pre-computed hidden-states. - - hidden_states (`List[Tensor]`, optional): - A Tensor list containing hidden-states of the model at the output of each layer plus - the initial embedding outputs. Each Tensor has a data type of `float32` and + tuple or dict: A tuple with items: (`output`, `new_mems`, `hidden_states`, `attentions`) + or a dict with key-value pairs: {"last_hidden_state": `output`, "mems": `new_mems`, + "hidden_states": `hidden_states`, "attentions": `attentions`}. + + With the corresponding fields: + + - `output` (Tensor): + Output of the final layer of the model. + It's a Tensor of dtype `float32` and has a shape of [batch_size, num_predict, hidden_size]. + + .. note:: + `num_predict` corresponds to `target_mapping.shape[1]`.
+ If `target_mapping` is `None`, then `num_predict` equals `sequence_length`. + - `mems` (List[Tensor]): + A list of pre-computed hidden-states. The length of the list is `n_layers`. + Each element in the list is a Tensor with dtype `float32` and has a shape of + [batch_size, sequence_length, hidden_size]. + - `hidden_states` (List[Tensor], optional): + A list of Tensor containing hidden-states of the model at the output of each layer + plus the initial embedding outputs. Each Tensor has a data type of `float32` and has a shape of [batch_size, sequence_length, hidden_size]. - - attentions (`List[Tensor]`, optional): - A Tensor list containing attentions weights after the attention softmax, used to compute - the weighted average in the self-attention heads. Each Tensor (one for each layer) has a data type - of `float32` and has a shape of [batch_size, num_heads, sequence_length, sequence_length]. + Returned when `output_hidden_states` is set to `True`. + - `attentions` (List[Tensor], optional): + A list of Tensor containing attentions weights of each hidden layer. + Each Tensor (one for each layer) has a data type of `float32` and + has a shape of [batch_size, num_heads, sequence_length, sequence_length]. + Returned when `output_attentions` is set to `True`. Example: .. code-block:: @@ -1229,14 +1257,14 @@ def forward(self, features, **kwargs): class XLNetForSequenceClassification(XLNetPretrainedModel): """ - XLNet Model with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. - for GLUE tasks. + XLNet Model with a sequence classification/regression head on top + (a linear layer on top of the pooled output) e.g. for GLUE tasks. Args: xlnet (:class:`XLNetModel`): An instance of :class:`XLNetModel`. - num_classes (`int`, optional): - The number of classes. Defaults to ``2``. + num_classes (int, optional): + The number of classes. Defaults to 2. """ def __init__(self, xlnet, num_classes=2): @@ -1265,53 +1293,53 @@ def forward( output_hidden_states=False, return_dict=False, ): r""" - The XLNetForSequenceClassification forward method, overrides the __call__() special method. + The XLNetForSequenceClassification forward method, overrides the `__call__()` special method. Args: - input_ids (`Tensor`): + input_ids (Tensor): See :class:`XLNetModel`. - token_type_ids (`Tensor`, optional): + token_type_ids (Tensor, optional): See :class:`XLNetModel`. - attention_mask (`Tensor`, optional): + attention_mask (Tensor, optional): See :class:`XLNetModel`. - mems (`Tensor`, optional): + mems (Tensor, optional): See :class:`XLNetModel`. - perm_mask (`Tensor`, optional): + perm_mask (Tensor, optional): See :class:`XLNetModel`. - target_mapping (`Tensor`, optional): + target_mapping (Tensor, optional): See :class:`XLNetModel`. - input_mask (`Tensor`, optional): + input_mask (Tensor, optional): See :class:`XLNetModel`. - head_mask (`Tensor`, optional): + head_mask (Tensor, optional): See :class:`XLNetModel`. - inputs_embeds (`Tensor`, optional): + inputs_embeds (Tensor, optional): See :class:`XLNetModel`. - use_mems_train (`bool`, optional): + use_mems_train (bool, optional): See :class:`XLNetModel`. - use_mems_eval (`bool`, optional): + use_mems_eval (bool, optional): See :class:`XLNetModel`. - output_attentions (`bool`, optional): + output_attentions (bool, optional): See :class:`XLNetModel`. - output_hidden_states (`bool`, optional): + output_hidden_states (bool, optional): See :class:`XLNetModel`.
- return_dict (`bool`, optional): + return_dict (bool, optional): See :class:`XLNetModel`. Returns: - A `tuple` or a `dict`: A tuple of shape (``output``, ``new_mems``, ``hidden_states``, ``attentions``) - or a dict of shape {"last_hidden_state": ``output``, "mems": ``new_mems``, - "hidden_states": ``hidden_states``, "attentions": ``attentions``}. + tuple or dict: A tuple with items: (`output`, `new_mems`, `hidden_states`, `attentions`) + or a dict with key-value pairs: {"last_hidden_state": `output`, "mems": `new_mems`, + "hidden_states": `hidden_states`, "attentions": `attentions`}. - With the fields: + With the corresponding fields: - output (`Tensor`): - Classification scores before SoftMax (also called logits). It's data type should be float32 + - `output` (Tensor): + Classification scores before SoftMax (also called logits). It's data type should be `float32` and has a shape of [batch_size, num_classes]. - mems (`List[Tensor]`): + - `mems` (List[Tensor]): See :class:`XLNetModel`. - hidden_states (`List[Tensor]`, optional): + - `hidden_states` (List[Tensor], optional): See :class:`XLNetModel`. - attentions (`List[Tensor]`, optional): + - `attentions` (List[Tensor], optional): See :class:`XLNetModel`. Example: @@ -1367,8 +1395,8 @@ class XLNetForTokenClassification(XLNetPretrainedModel): Args: xlnet (:class:`XLNetModel`): An instance of :class:`XLNetModel`. - num_classes (`int`, optional): - The number of classes. Defaults to ``2``. + num_classes (int, optional): + The number of classes. Defaults to 2. """ def __init__(self, xlnet, num_classes=2): @@ -1397,53 +1425,53 @@ def forward( output_hidden_states=False, return_dict=False, ): r""" - The XLNetForTokenClassification forward method, overrides the __call__() special method. + The XLNetForTokenClassification forward method, overrides the `__call__()` special method. Args: - input_ids (`Tensor`): + input_ids (Tensor): See :class:`XLNetModel`. - token_type_ids (`Tensor`, optional): + token_type_ids (Tensor, optional): See :class:`XLNetModel`. - attention_mask (`Tensor`, optional): + attention_mask (Tensor, optional): See :class:`XLNetModel`. - mems (`Tensor`, optional): + mems (Tensor, optional): See :class:`XLNetModel`. - perm_mask (`Tensor`, optional): + perm_mask (Tensor, optional): See :class:`XLNetModel`. - target_mapping (`Tensor`, optional): + target_mapping (Tensor, optional): See :class:`XLNetModel`. - input_mask (`Tensor`, optional): + input_mask (Tensor, optional): See :class:`XLNetModel`. - head_mask (`Tensor`, optional): + head_mask (Tensor, optional): See :class:`XLNetModel`. - inputs_embeds (`Tensor`, optional): + inputs_embeds (Tensor, optional): See :class:`XLNetModel`. - use_mems_train (`bool`, optional): + use_mems_train (bool, optional): See :class:`XLNetModel`. - use_mems_eval (`bool`, optional): + use_mems_eval (bool, optional): See :class:`XLNetModel`. - output_attentions (`bool`, optional): + output_attentions (bool, optional): See :class:`XLNetModel`. - output_hidden_states (`bool`, optional): + output_hidden_states (bool, optional): See :class:`XLNetModel`. - return_dict (`bool`, optional): + return_dict (bool, optional): See :class:`XLNetModel`. Returns: - A `tuple` or a `dict`: A tuple of shape (``output``, ``new_mems``, ``hidden_states``, ``attentions``) - or a dict of shape {"last_hidden_state": ``output``, "mems": ``new_mems``, - "hidden_states": ``hidden_states``, "attentions": ``attentions``}. 
+ tuple or dict: A tuple with items: (`output`, `new_mems`, `hidden_states`, `attentions`) + or a dict with key-value pairs: {"last_hidden_state": `output`, "mems": `new_mems`, + "hidden_states": `hidden_states`, "attentions": `attentions`}. - With the fields: + With the corresponding fields: - - output (`Tensor`): - Classification scores before SoftMax (also called logits). It's data type should be float32 + - `output` (Tensor): + Classification scores before SoftMax (also called logits). It's data type should be `float32` and has a shape of [batch_size, sequence_length, num_classes]. - - mems (`List[Tensor]`): + - `mems` (List[Tensor]): See :class:`XLNetModel`. - - hidden_states (`List[Tensor]`, optional): + - `hidden_states` (List[Tensor], optional): See :class:`XLNetModel`. - - attentions (`List[Tensor]`, optional): + - `attentions` (List[Tensor], optional): See :class:`XLNetModel`. Example: diff --git a/paddlenlp/transformers/xlnet/tokenizer.py b/paddlenlp/transformers/xlnet/tokenizer.py index 03b357d7f73f84..756056f27abacf 100644 --- a/paddlenlp/transformers/xlnet/tokenizer.py +++ b/paddlenlp/transformers/xlnet/tokenizer.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Tokenization class for XLNet model.""" +"""Tokenization class for XLNet model.""" import os import unicodedata @@ -37,44 +37,54 @@ class XLNetTokenizer(PretrainedTokenizer): """ - Constructs an XLNet tokenizer. Based on `SentencePiece `__. + Constructs an XLNet tokenizer based on `SentencePiece `__. + + This tokenizer inherits from :class:`~paddlenlp.transformers.tokenizer_utils.PretrainedTokenizer` + which contains most of the main methods. For more information regarding those methods, + please refer to this superclass. Args: - vocab_file (`str`): - ``SentencePiece`` file (ends with .spm) that contains the vocabulary necessary - to instantiate a tokenizer. - do_lower_case (`bool`, optional): - Whether to lowercase the input when tokenizing. Defaults to ``False`` and - we do not lowercase the input. - remove_space (`bool`, optional): - Whether to strip the text when tokenizing. Defaults to ``True`` and - we remove excess spaces before and after the string. - keep_accents (`bool`, optional): - Whether to keep accents when tokenizing. - Defaults to ``False`` and we don't keep accents. - bos_token (`str`, optional): - The beginning of sequence token that was used during pretraining. Defaults to ``""``. - eos_token (`str`, optional): - The end of sequence token. Defaults to ``""``. - unk_token (`str`, optional): - The unknown token. A token that is not in the vocabulary is set to be unk_token - inorder to be converted to an ID. Defaults to ``""``. - sep_token (`str`, optional): - The separator token. Defaults to ``""``. - pad_token (`str`, optional): - The token used for padding. Defaults to ``""``. - cls_token (`str`, optional): - The classifier token which is used when doing sequence classification. - It is the last token of the sequence when built with special tokens. Defaults to ``""``. - mask_token (`str`, optional): - The token used for masking values. In the masked language modeling task, - this is the token used and which the model will try to predict. Defaults to ``""``. - additional_special_tokens (`List[str]`, optional): - Additional special tokens used by the tokenizer. Defaults to ``["", ""]``. 
+ vocab_file (str): + The vocabulary file (ends with '.spm') required to instantiate + a `SentencePiece `__ tokenizer. + do_lower_case (bool, optional): + Whether or not to lowercase the input when tokenizing. Defaults to `False` and + **does not** lowercase the input. + remove_space (bool, optional): + Whether or not to strip the text when tokenizing. Defaults to `True` and + removes excess spaces before and after the string. + keep_accents (bool, optional): + Whether or not to keep accents when tokenizing. Defaults to `False` and **does not** keep accents. + bos_token (str, optional): + A special token representing the beginning of a sequence that was used during pretraining. + Defaults to `""`. + eos_token (str, optional): + A special token representing the end of a sequence that was used during pretraining. + Defaults to `""`. + unk_token (str, optional): + A special token representing the *unknown (out-of-vocabulary)* token. + An unknown token is set to be `unk_token` in order to be converted to an ID. + Defaults to `""`. + sep_token (str, optional): + A special token separating two different sentences in the same input. + Defaults to `""`. + pad_token (str, optional): + A special token used to make arrays of tokens the same size for batching purposes. + Defaults to `""`. + cls_token (str, optional): + A special token used for sequence classification. It is the last token + of the sequence when built with special tokens. Defaults to `""`. + mask_token (str, optional): + A special token representing a masked token. This is the token used + in the masked language modeling task, in which the model tries to predict the original unmasked token. + Defaults to `""`. + additional_special_tokens (List[str], optional): + A list of additional special tokens to be used by the tokenizer. + Defaults to `["", ""]`. Attributes: - sp_model (`SentencePieceProcessor`): - The ``SentencePiece`` processor that is used for every conversion (string, tokens and IDs). + sp_model (SentencePieceProcessor): + The *SentencePiece* processor that is used for every conversion (string, tokens and IDs). """ resource_files_names = {"vocab_file": "spiece.model"} @@ -209,15 +219,15 @@ def _tokenize(self, text, sample=False): return new_pieces def tokenize(self, text): - """ - End-to-end tokenization for XLNet models. - - Args: - text (`str`): - The text to be tokenized. - Returns: - `List(str)`: A list of string representing converted tokens. - """ + # """ + # Converts a string to a list of tokens. + # + # Args: + # text (str): + # The text to be tokenized. + # Returns: + # List(str): A list of string representing converted tokens. + # """ return self._tokenize(text) def _convert_token_to_id(self, token): @@ -234,11 +244,11 @@ def convert_tokens_to_ids(self, tokens): using the vocabulary. Args: - tokens (`str` or `List[str]`): + tokens (str or List[str]): One or several token(s) to convert to token id(s). Returns: - `int` or `List[int]` or `tuple(int)`: The token id or list of token ids or tuple of token ids. + int or List[int] or tuple(int): The token id or list of token ids or tuple of token ids. """ if not isinstance(tokens, (list, tuple)): return self._convert_token_to_id(tokens) @@ -251,14 +261,14 @@ def convert_ids_to_tokens(self, ids, skip_special_tokens=False): a sequence of tokens, using the vocabulary and added tokens. Args: - ids (`int` or `List[int]`): The token id (or token ids) to be converted to token(s).
- skip_special_tokens (`bool`, optional): + skip_special_tokens (bool, optional): Whether or not to remove special tokens in the decoding. - Defaults to ``False`` and we do not remove special tokens. + Defaults to `False` and we do not remove special tokens. Returns: - `str` or `List[str]`: The decoded token(s). + str or List[str]: The decoded token(s). """ if not isinstance(ids, (list, tuple)): return self._convert_id_to_token(ids) @@ -271,7 +281,7 @@ def convert_ids_to_tokens(self, ids, skip_special_tokens=False): return tokens def convert_tokens_to_string(self, tokens): - """Converts a sequence of tokens (strings for sub-words) in a single string.""" + # Converts a sequence of tokens (strings for sub-words) into a single string. out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip() return out_string @@ -284,12 +294,12 @@ def num_special_tokens_to_add(self, pair=False): Do not put this inside your training loop. Args: - pair (`bool`, optional): - Whether the sequence is a sequence pair or a single sequence. - Defaults to ``False`` and the input is a single sequence. + pair (bool, optional): + Whether the input is a sequence pair or a single sequence. + Defaults to `False` and the input is a single sequence. Returns: - `int`: Number of tokens added to sequences. + int: Number of tokens added to sequences. """ token_ids_0 = [] token_ids_1 = [] @@ -307,13 +317,13 @@ def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): - pair of sequences: ``A B `` Args: - token_ids_0 (`List[int]`): + token_ids_0 (List[int]): List of IDs for the first sequence. - token_ids_1 (`List[int]`, optional): - Optional second list of IDs for sequence pairs. Defaults to ``None``. + token_ids_1 (List[int], optional): + Optional second list of IDs for the second sequence. Defaults to `None`. Returns: - `List[int]`: List of input IDs with the appropriate special tokens. + List[int]: List of input IDs with the appropriate special tokens. """ sep = [self.sep_token_id] cls = [self.cls_token_id] @@ -334,14 +344,14 @@ def build_offset_mapping_with_special_tokens(self, - pair of sequences: ``A (0,0) B (0,0) (0,0)`` Args: - offset_mapping_0 (`List[tuple]`): + offset_mapping_0 (List[tuple]): List of char offsets to which the special tokens will be added. - offset_mapping_1 (`List[tuple]`, optional): + offset_mapping_1 (List[tuple], optional): Optional second list of char offsets for offset mapping pairs. - Defaults to ``None``. + Defaults to `None`. Returns: - `List[tuple]`: List of char offsets with the appropriate offsets of special tokens. + List[tuple]: A list of char offsets with the appropriate offsets of special tokens. """ if offset_mapping_1 is None: return offset_mapping_0 + [(0, 0)] + [(0, 0)] @@ -356,20 +366,20 @@ def get_special_tokens_mask(self, already_has_special_tokens=False): """ Creates a special tokens mask from the input sequences. - This method is called when adding special tokens using the tokenizer ``encode`` method. + This method is called when adding special tokens using the tokenizer `encode` method. Args: - token_ids_0 (`List[int]`): - List of IDs for the first sequence. - token_ids_1 (`List[int]`, optional): - Optional second list of IDs for sequence pairs. - Defaults to ``None``. - already_has_special_tokens (`bool`, optional): - Whether or not the token list is already formatted with special tokens for the model. - Defaults to ``False``. + token_ids_0 (List[int]): + A list of `inputs_ids` for the first sequence.
+ token_ids_1 (List[int], optional): + Optional second list of `inputs_ids` for the second sequence. + Defaults to `None`. + already_has_special_tokens (bool, optional): + Whether or not the token list already contains special tokens for the model. + Defaults to `False`. Returns: - `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + List[int]: A list of integers which is either 0 or 1: 1 for a special token, 0 for a sequence token. """ if already_has_special_tokens: @@ -391,26 +401,36 @@ def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): """ - Creates a mask from the input sequences. - An XLNet sequence pair mask has the following format: + Creates a token_type mask from the input sequences. + If `token_ids_1` is not `None`, then a sequence pair + token_type mask has the following format: :: 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 2 | first sequence | second sequence | + Else if `token_ids_1` is `None`, then a single sequence + token_type mask has the following format: + + :: + + 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 + | first sequence | + - 0 stands for the segment id of **first segment tokens**, - 1 stands for the segment id of **second segment tokens**, - 2 stands for the segment id of **cls_token**. Args: - token_ids_0 (`List[int]`): - List of IDs for the first sequence. - token_ids_1 (`List[int]`, optional): - Optional second list of IDs for the sequence pair. Defaults to ``None``. + token_ids_0 (List[int]): + A list of `inputs_ids` for the first sequence. + token_ids_1 (List[int], optional): + Optional second list of `inputs_ids` for the second sequence. + Defaults to `None`. Returns: - `List[int]`: List of token type IDs according to the given sequence(s). + List[int]: List of token type IDs according to the given sequence(s). """ sep = [self.sep_token_id] cls_segment_id = [2] @@ -422,10 +442,11 @@ def create_token_type_ids_from_sequences(self, def save_resources(self, save_directory): """ - Saves tokenizer related resources to files under `save_directory`. + Saves `SentencePiece `__ file + (ends with '.spm') under `save_directory`. Args: - save_directory (`str`): + save_directory (str): Directory to save files into. """ for name, file_name in self.resource_files_names.items():
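As a usage reference for the special-token layout and the 0/1/2 token type ids documented above, a rough sketch (it assumes the `xlnet-base-cased` pretrained vocabulary is available; the sentences are only illustrative):

.. code-block::

    from paddlenlp.transformers import XLNetTokenizer

    tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")

    ids_a = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("Hello world"))
    ids_b = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("Nice to meet you"))

    # First sequence, then second sequence, with the special tokens appended at the end.
    input_ids = tokenizer.build_inputs_with_special_tokens(ids_a, ids_b)

    # 0s for the first segment, 1s for the second, and a trailing 2 for the cls token.
    token_type_ids = tokenizer.create_token_type_ids_from_sequences(ids_a, ids_b)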