diff --git a/conv_seq2seq/README.md b/conv_seq2seq/README.md new file mode 100644 index 0000000000..817c464a3a --- /dev/null +++ b/conv_seq2seq/README.md @@ -0,0 +1,50 @@ +# Convolutional Sequence to Sequence Learning +This model implements the work in the following paper: + +Jonas Gehring, Micheal Auli, David Grangier, et al. Convolutional Sequence to Sequence Learning. Association for Computational Linguistics (ACL), 2017 + +# Training a Model +- Modify the following script if needed and then run: + + ```bash + python train.py \ + --train_data_path ./data/train_data \ + --test_data_path ./data/test_data \ + --src_dict_path ./data/src_dict \ + --trg_dict_path ./data/trg_dict \ + --enc_blocks "[(256, 3)] * 5" \ + --dec_blocks "[(256, 3)] * 3" \ + --emb_size 256 \ + --pos_size 200 \ + --drop_rate 0.1 \ + --use_gpu False \ + --trainer_count 1 \ + --batch_size 32 \ + --num_passes 20 \ + >train.log 2>&1 + ``` + +# Inferring by a Trained Model +- Infer by a trained model by running: + + ```bash + python infer.py \ + --infer_data_path ./data/infer_data \ + --src_dict_path ./data/src_dict \ + --trg_dict_path ./data/trg_dict \ + --enc_blocks "[(256, 3)] * 5" \ + --dec_blocks "[(256, 3)] * 3" \ + --emb_size 256 \ + --pos_size 200 \ + --drop_rate 0.1 \ + --use_gpu False \ + --trainer_count 1 \ + --max_len 100 \ + --beam_size 1 \ + --model_path ./params.pass-0.tar.gz \ + 1>infer_result 2>infer.log + ``` + +# Notes + +Currently, beam search will forward the encoder multiple times when predicting each target word, which requires extra computations. And we will fix it later. diff --git a/conv_seq2seq/beamsearch.py b/conv_seq2seq/beamsearch.py new file mode 100644 index 0000000000..45656e809b --- /dev/null +++ b/conv_seq2seq/beamsearch.py @@ -0,0 +1,163 @@ +#coding=utf-8 + +import sys +import time +import numpy as np + + +class BeamSearch(object): + """ + Generate sequence by beam search + NOTE: this class only implements generating one sentence at a time. + """ + + def __init__(self, + inferer, + trg_dict, + pos_size, + padding_num, + beam_size=1, + max_len=100): + self.inferer = inferer + self.trg_dict = trg_dict + self.word_padding = trg_dict.__len__() + self.pos_size = pos_size + self.pos_padding = pos_size + self.padding_num = padding_num + self.win_len = padding_num + 1 + self.max_len = max_len + self.beam_size = beam_size + + def get_beam_input(self, pre_beam_list, infer_data): + """ + Get input for generation at the current iteration. + """ + beam_input = [] + + if len(pre_beam_list) == 0: + cur_trg = [self.word_padding + ] * self.padding_num + [self.trg_dict['']] + cur_trg_pos = [self.pos_padding] * self.padding_num + [0] + beam_input.append(infer_data + [cur_trg] + [cur_trg_pos]) + else: + for seq in pre_beam_list: + if len(seq) < self.win_len: + cur_trg = [self.word_padding] * ( + self.win_len - len(seq) - 1 + ) + [self.trg_dict['']] + seq + cur_trg_pos = [self.pos_padding] * ( + self.win_len - len(seq) - 1) + [0] + range(1, + len(seq) + 1) + else: + cur_trg = seq[-self.win_len:] + cur_trg_pos = range( + len(seq) + 1 - self.win_len, len(seq) + 1) + + beam_input.append(infer_data + [cur_trg] + [cur_trg_pos]) + return beam_input + + def get_prob(self, beam_input): + """ + Get the probabilities of all possible tokens. + """ + row_list = [j * self.win_len for j in range(len(beam_input))] + prob = self.inferer.infer(beam_input, field='value')[row_list, :] + return prob + + def get_candidate(self, pre_beam_list, pre_beam_score, prob): + """ + Get top beam_size tokens and their scores for each beam. + """ + if prob.ndim == 1: + candidate_id = prob.argsort()[-self.beam_size:][::-1] + candidate_log_prob = np.log(prob[candidate_id]) + else: + candidate_id = prob.argsort()[:, -self.beam_size:][:, ::-1] + candidate_log_prob = np.zeros_like(candidate_id).astype('float32') + for j in range(len(pre_beam_list)): + candidate_log_prob[j, :] = np.log(prob[j, candidate_id[j, :]]) + + if pre_beam_score.size > 0: + candidate_score = candidate_log_prob + pre_beam_score.reshape( + (pre_beam_score.size, 1)) + else: + candidate_score = candidate_log_prob + + return candidate_id, candidate_score + + def prune(self, candidate_id, candidate_score, pre_beam_list, + completed_seq_list, completed_seq_score, completed_seq_min_score): + """ + Pruning process of the beam search. During the process, beam_size most possible sequences + are selected for the beam in the next iteration. Besides, their scores and the minimum score + of the completed sequences are updated. + """ + candidate_id = candidate_id.flatten() + candidate_score = candidate_score.flatten() + + topk_idx = candidate_score.argsort()[-self.beam_size:][::-1].tolist() + topk_seq_idx = [idx / self.beam_size for idx in topk_idx] + + next_beam = [] + beam_score = [] + for j in range(len(topk_idx)): + if candidate_id[topk_idx[j]] == self.trg_dict['']: + if len( + completed_seq_list + ) < self.beam_size or completed_seq_min_score <= candidate_score[ + topk_idx[j]]: + completed_seq_list.append(pre_beam_list[topk_seq_idx[j]]) + completed_seq_score.append(candidate_score[topk_idx[j]]) + + if completed_seq_min_score is None or ( + completed_seq_min_score >= + candidate_score[topk_idx[j]] and + len(completed_seq_list) < self.beam_size): + completed_seq_min_score = candidate_score[topk_idx[j]] + else: + seq = pre_beam_list[topk_seq_idx[ + j]] + [candidate_id[topk_idx[j]]] + score = candidate_score[topk_idx[j]] + next_beam.append(seq) + beam_score.append(score) + + beam_score = np.array(beam_score) + return next_beam, beam_score, completed_seq_min_score + + def search_one_sample(self, infer_data): + """ + Beam search process for one sample. + """ + completed_seq_list = [] + completed_seq_score = [] + completed_seq_min_score = None + uncompleted_seq_list = [[]] + uncompleted_seq_score = np.zeros(0) + + for i in xrange(self.max_len): + beam_input = self.get_beam_input(uncompleted_seq_list, infer_data) + + prob = self.get_prob(beam_input) + + candidate_id, candidate_score = self.get_candidate( + uncompleted_seq_list, uncompleted_seq_score, prob) + + uncompleted_seq_list, uncompleted_seq_score, completed_seq_min_score = self.prune( + candidate_id, candidate_score, uncompleted_seq_list, + completed_seq_list, completed_seq_score, + completed_seq_min_score) + + if len(uncompleted_seq_list) == 0: + break + if len(completed_seq_list) >= self.beam_size: + seq_max_score = uncompleted_seq_score.max() + if seq_max_score < completed_seq_min_score: + uncompleted_seq_list = [] + break + + final_seq_list = completed_seq_list + uncompleted_seq_list + final_score = np.concatenate( + (np.array(completed_seq_score), uncompleted_seq_score)) + max_id = final_score.argmax() + top_seq = final_seq_list[max_id] + return top_seq diff --git a/conv_seq2seq/infer.py b/conv_seq2seq/infer.py new file mode 100644 index 0000000000..eb46df5549 --- /dev/null +++ b/conv_seq2seq/infer.py @@ -0,0 +1,199 @@ +#coding=utf-8 + +import sys +import argparse +import distutils.util +import gzip + +import paddle.v2 as paddle +from model import conv_seq2seq +from beamsearch import BeamSearch +import reader + + +def parse_args(): + parser = argparse.ArgumentParser( + description="PaddlePaddle Convolutional Seq2Seq") + parser.add_argument( + '--infer_data_path', + type=str, + required=True, + help="Path of the dataset for inference") + parser.add_argument( + '--src_dict_path', + type=str, + required=True, + help='Path of the source dictionary') + parser.add_argument( + '--trg_dict_path', + type=str, + required=True, + help='path of the target dictionary') + parser.add_argument( + '--enc_blocks', type=str, help='Convolution blocks of the encoder') + parser.add_argument( + '--dec_blocks', type=str, help='Convolution blocks of the decoder') + parser.add_argument( + '--emb_size', + type=int, + default=512, + help='Dimension of word embedding. (default: %(default)s)') + parser.add_argument( + '--pos_size', + type=int, + default=200, + help='Total number of the position indexes. (default: %(default)s)') + parser.add_argument( + '--drop_rate', + type=float, + default=0., + help='Dropout rate. (default: %(default)s)') + parser.add_argument( + "--use_gpu", + default=False, + type=distutils.util.strtobool, + help="Use gpu or not. (default: %(default)s)") + parser.add_argument( + "--trainer_count", + default=1, + type=int, + help="Trainer number. (default: %(default)s)") + parser.add_argument( + '--max_len', + type=int, + default=100, + help="The maximum length of the sentence to be generated. (default: %(default)s)" + ) + parser.add_argument( + "--beam_size", + default=1, + type=int, + help="The width of beam expasion. (default: %(default)s)") + parser.add_argument( + "--model_path", + type=str, + required=True, + help="The path of trained model. (default: %(default)s)") + return parser.parse_args() + + +def to_sentence(seq, dictionary): + raw_sentence = [dictionary[id] for id in seq] + sentence = " ".join(raw_sentence) + return sentence + + +def infer(infer_data_path, + src_dict_path, + trg_dict_path, + model_path, + enc_conv_blocks, + dec_conv_blocks, + emb_dim=512, + pos_size=200, + drop_rate=0., + max_len=100, + beam_size=1): + """ + Inference. + + :param infer_data_path: The path of the data for inference. + :type infer_data_path: str + :param src_dict_path: The path of the source dictionary. + :type src_dict_path: str + :param trg_dict_path: The path of the target dictionary. + :type trg_dict_path: str + :param model_path: The path of a trained model. + :type model_path: str + :param enc_conv_blocks: The scale list of the encoder's convolution blocks. And each element of + the list contains output dimension and context length of the corresponding + convolution block. + :type enc_conv_blocks: list of tuple + :param dec_conv_blocks: The scale list of the decoder's convolution blocks. And each element of + the list contains output dimension and context length of the corresponding + convolution block. + :type dec_conv_blocks: list of tuple + :param emb_dim: The dimension of the embedding vector. + :type emb_dim: int + :param pos_size: The total number of the position indexes, which means + the maximum value of the index is pos_size - 1. + :type pos_size: int + :param drop_rate: Dropout rate. + :type drop_rate: float + :param max_len: The maximum length of the sentence to be generated. + :type max_len: int + :param beam_size: The width of beam expansion. + :type beam_size: int + """ + # load dict + src_dict = reader.load_dict(src_dict_path) + trg_dict = reader.load_dict(trg_dict_path) + src_dict_size = src_dict.__len__() + trg_dict_size = trg_dict.__len__() + + prob = conv_seq2seq( + src_dict_size=src_dict_size, + trg_dict_size=trg_dict_size, + pos_size=pos_size, + emb_dim=emb_dim, + enc_conv_blocks=enc_conv_blocks, + dec_conv_blocks=dec_conv_blocks, + drop_rate=drop_rate, + is_infer=True) + + # load parameters + parameters = paddle.parameters.Parameters.from_tar(gzip.open(model_path)) + + padding_list = [context_len - 1 for (size, context_len) in dec_conv_blocks] + padding_num = reduce(lambda x, y: x + y, padding_list) + infer_reader = reader.data_reader( + data_file=infer_data_path, + src_dict=src_dict, + trg_dict=trg_dict, + pos_size=pos_size, + padding_num=padding_num) + + inferer = paddle.inference.Inference( + output_layer=prob, parameters=parameters) + + searcher = BeamSearch( + inferer=inferer, + trg_dict=trg_dict, + pos_size=pos_size, + padding_num=padding_num, + max_len=max_len, + beam_size=beam_size) + + reverse_trg_dict = reader.get_reverse_dict(trg_dict) + for i, raw_data in enumerate(infer_reader()): + infer_data = [raw_data[0], raw_data[1]] + result = searcher.search_one_sample(infer_data) + sentence = to_sentence(result, reverse_trg_dict) + print sentence + sys.stdout.flush() + return + + +def main(): + args = parse_args() + enc_conv_blocks = eval(args.enc_blocks) + dec_conv_blocks = eval(args.dec_blocks) + + paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count) + + infer( + infer_data_path=args.infer_data_path, + src_dict_path=args.src_dict_path, + trg_dict_path=args.trg_dict_path, + model_path=args.model_path, + enc_conv_blocks=enc_conv_blocks, + dec_conv_blocks=dec_conv_blocks, + emb_dim=args.emb_size, + pos_size=args.pos_size, + drop_rate=args.drop_rate, + max_len=args.max_len, + beam_size=args.beam_size) + + +if __name__ == '__main__': + main() diff --git a/conv_seq2seq/model.py b/conv_seq2seq/model.py new file mode 100644 index 0000000000..01dd94288b --- /dev/null +++ b/conv_seq2seq/model.py @@ -0,0 +1,417 @@ +#coding=utf-8 + +import math + +import paddle.v2 as paddle + +__all__ = ["conv_seq2seq"] + + +def gated_conv_with_batchnorm(input, + size, + context_len, + context_start=None, + learning_rate=1.0, + drop_rate=0.): + """ + Definition of the convolution block. + + :param input: The input of this block. + :type input: LayerOutput + :param size: The dimension of the block's output. + :type size: int + :param context_len: The context length of the convolution. + :type context_len: int + :param context_start: The start position of the context. + :type context_start: int + :param learning_rate: The learning rate factor of the parameters in the block. + The actual learning rate is the product of the global + learning rate and this factor. + :type learning_rate: float + :param drop_rate: Dropout rate. + :type drop_rate: float + :return: The output of the convolution block. + :rtype: LayerOutput + """ + input = paddle.layer.dropout(input=input, dropout_rate=drop_rate) + + context = paddle.layer.mixed( + size=input.size * context_len, + input=paddle.layer.context_projection( + input=input, context_len=context_len, context_start=context_start)) + + raw_conv = paddle.layer.fc( + input=context, + size=size * 2, + act=paddle.activation.Linear(), + param_attr=paddle.attr.Param( + initial_mean=0., + initial_std=math.sqrt(4.0 * (1.0 - drop_rate) / context.size), + learning_rate=learning_rate), + bias_attr=False) + + batch_norm_conv = paddle.layer.batch_norm( + input=raw_conv, + act=paddle.activation.Linear(), + param_attr=paddle.attr.Param(learning_rate=learning_rate)) + + with paddle.layer.mixed(size=size) as conv: + conv += paddle.layer.identity_projection( + batch_norm_conv, size=size, offset=0) + + with paddle.layer.mixed(size=size, act=paddle.activation.Sigmoid()) as gate: + gate += paddle.layer.identity_projection( + batch_norm_conv, size=size, offset=size) + + with paddle.layer.mixed(size=size) as gated_conv: + gated_conv += paddle.layer.dotmul_operator(conv, gate) + + return gated_conv + + +def encoder(token_emb, + pos_emb, + conv_blocks=[(256, 3)] * 5, + num_attention=3, + drop_rate=0.1): + """ + Definition of the encoder. + + :param token_emb: The embedding vector of the input token. + :type token_emb: LayerOutput + :param pos_emb: The embedding vector of the input token's position. + :type pos_emb: LayerOutput + :param conv_blocks: The scale list of the convolution blocks. Each element of + the list contains output dimension and context length of + the corresponding convolution block. + :type conv_blocks: list of tuple + :param num_attention: The total number of the attention modules used in the decoder. + :type num_attention: int + :param drop_rate: Dropout rate. + :type drop_rate: float + :return: The input token encoding. + :rtype: LayerOutput + """ + embedding = paddle.layer.addto( + input=[token_emb, pos_emb], + layer_attr=paddle.attr.Extra(drop_rate=drop_rate)) + + proj_size = conv_blocks[0][0] + block_input = paddle.layer.fc( + input=embedding, + size=proj_size, + act=paddle.activation.Linear(), + param_attr=paddle.attr.Param( + initial_mean=0., + initial_std=math.sqrt((1.0 - drop_rate) / embedding.size), + learning_rate=1.0 / (2.0 * num_attention)), + bias_attr=True, ) + + for (size, context_len) in conv_blocks: + if block_input.size == size: + residual = block_input + else: + residual = paddle.layer.fc( + input=block_input, + size=size, + act=paddle.activation.Linear(), + param_attr=paddle.attr.Param(learning_rate=1.0 / + (2.0 * num_attention)), + bias_attr=True) + + gated_conv = gated_conv_with_batchnorm( + input=block_input, + size=size, + context_len=context_len, + learning_rate=1.0 / (2.0 * num_attention), + drop_rate=drop_rate) + + with paddle.layer.mixed(size=size) as block_output: + block_output += paddle.layer.identity_projection(residual) + block_output += paddle.layer.identity_projection(gated_conv) + + # halve the variance of the sum + block_output = paddle.layer.slope_intercept( + input=block_output, slope=math.sqrt(0.5)) + + block_input = block_output + + emb_dim = embedding.size + encoded_vec = paddle.layer.fc( + input=block_output, + size=emb_dim, + act=paddle.activation.Linear(), + param_attr=paddle.attr.Param(learning_rate=1.0 / (2.0 * num_attention)), + bias_attr=True) + + encoded_sum = paddle.layer.addto(input=[encoded_vec, embedding]) + + # halve the variance of the sum + encoded_sum = paddle.layer.slope_intercept(input=encoded_sum, slope=math.sqrt(0.5)) + + return encoded_vec, encoded_sum + + +def attention(decoder_state, cur_embedding, encoded_vec, encoded_sum): + """ + Definition of the attention. + + :param decoder_state: The hidden state of the decoder. + :type decoder_state: LayerOutput + :param cur_embedding: The embedding vector of the current token. + :type cur_embedding: LayerOutput + :param encoded_vec: The source token encoding. + :type encoded_vec: LayerOutput + :param encoded_sum: The sum of the source token's encoding and embedding. + :type encoded_sum: LayerOutput + :return: A context vector. + :rtype: LayerOutput + """ + residual = decoder_state + + state_size = decoder_state.size + emb_dim = cur_embedding.size + with paddle.layer.mixed(size=emb_dim, bias_attr=True) as state_summary: + state_summary += paddle.layer.full_matrix_projection(decoder_state) + state_summary += paddle.layer.identity_projection(cur_embedding) + + # halve the variance of the sum + state_summary = paddle.layer.slope_intercept( + input=state_summary, slope=math.sqrt(0.5)) + + expanded = paddle.layer.expand(input=state_summary, expand_as=encoded_vec) + + m = paddle.layer.linear_comb(weights=expanded, vectors=encoded_vec) + + attention_weight = paddle.layer.fc( + input=m, + size=1, + act=paddle.activation.SequenceSoftmax(), + bias_attr=False) + + scaled = paddle.layer.scaling(weight=attention_weight, input=encoded_sum) + + attended = paddle.layer.pooling( + input=scaled, pooling_type=paddle.pooling.Sum()) + + attended_proj = paddle.layer.fc( + input=attended, + size=state_size, + act=paddle.activation.Linear(), + bias_attr=True) + + attention_result = paddle.layer.addto(input=[attended_proj, residual]) + + # halve the variance of the sum + attention_result = paddle.layer.slope_intercept( + input=attention_result, slope=math.sqrt(0.5)) + return attention_result + + +def decoder(token_emb, + pos_emb, + encoded_vec, + encoded_sum, + dict_size, + conv_blocks=[(256, 3)] * 3, + drop_rate=0.1): + """ + Definition of the decoder. + + :param token_emb: The embedding vector of the input token. + :type token_emb: LayerOutput + :param pos_emb: The embedding vector of the input token's position. + :type pos_emb: LayerOutput + :param encoded_vec: The source token encoding. + :type encoded_vec: LayerOutput + :param encoded_sum: The sum of the source token's encoding and embedding. + :type encoded_sum: LayerOutput + :param dict_size: The size of the target dictionary. + :type dict_size: int + :param conv_blocks: The scale list of the convolution blocks. Each element + of the list contains output dimension and context length + of the corresponding convolution block. + :type conv_blocks: list of tuple + :param drop_rate: Dropout rate. + :type drop_rate: float + :return: The probability of the predicted token. + :rtype: LayerOutput + """ + + def attention_step(decoder_state, cur_embedding, encoded_vec, encoded_sum): + conditional = attention( + decoder_state=decoder_state, + cur_embedding=cur_embedding, + encoded_vec=encoded_vec, + encoded_sum=encoded_sum) + return conditional + + embedding = paddle.layer.addto( + input=[token_emb, pos_emb], + layer_attr=paddle.attr.Extra(drop_rate=drop_rate)) + + proj_size = conv_blocks[0][0] + block_input = paddle.layer.fc( + input=embedding, + size=proj_size, + act=paddle.activation.Linear(), + param_attr=paddle.attr.Param( + initial_mean=0., + initial_std=math.sqrt((1.0 - drop_rate) / embedding.size)), + bias_attr=True, ) + + for (size, context_len) in conv_blocks: + if block_input.size == size: + residual = block_input + else: + residual = paddle.layer.fc( + input=block_input, + size=size, + act=paddle.activation.Linear(), + bias_attr=True) + + decoder_state = gated_conv_with_batchnorm( + input=block_input, + size=size, + context_len=context_len, + context_start=0, + drop_rate=drop_rate) + + group_inputs = [ + decoder_state, + embedding, + paddle.layer.StaticInput(input=encoded_vec), + paddle.layer.StaticInput(input=encoded_sum), + ] + + conditional = paddle.layer.recurrent_group( + step=attention_step, input=group_inputs) + + block_output = paddle.layer.addto(input=[conditional, residual]) + + # halve the variance of the sum + block_output = paddle.layer.slope_intercept( + input=block_output, slope=math.sqrt(0.5)) + + block_input = block_output + + out_emb_dim = embedding.size + block_output = paddle.layer.fc( + input=block_output, + size=out_emb_dim, + act=paddle.activation.Linear(), + layer_attr=paddle.attr.Extra(drop_rate=drop_rate)) + + decoder_out = paddle.layer.fc( + input=block_output, + size=dict_size, + act=paddle.activation.Softmax(), + param_attr=paddle.attr.Param( + initial_mean=0., + initial_std=math.sqrt((1.0 - drop_rate) / block_output.size)), + bias_attr=True) + + return decoder_out + + +def conv_seq2seq(src_dict_size, + trg_dict_size, + pos_size, + emb_dim, + enc_conv_blocks=[(256, 3)] * 5, + dec_conv_blocks=[(256, 3)] * 3, + drop_rate=0.1, + is_infer=False): + """ + Definition of convolutional sequence-to-sequence network. + + :param src_dict_size: The size of the source dictionary. + :type src_dict_size: int + :param trg_dict_size: The size of the target dictionary. + :type trg_dict_size: int + :param pos_size: The total number of the position indexes, which means + the maximum value of the index is pos_size - 1. + :type pos_size: int + :param emb_dim: The dimension of the embedding vector. + :type emb_dim: int + :param enc_conv_blocks: The scale list of the encoder's convolution blocks. Each element + of the list contains output dimension and context length of the + corresponding convolution block. + :type enc_conv_blocks: list of tuple + :param dec_conv_blocks: The scale list of the decoder's convolution blocks. Each element + of the list contains output dimension and context length of the + corresponding convolution block. + :type dec_conv_blocks: list of tuple + :param drop_rate: Dropout rate. + :type drop_rate: float + :param is_infer: Whether infer or not. + :type is_infer: bool + :return: Cost or output layer. + :rtype: LayerOutput + """ + src = paddle.layer.data( + name='src_word', + type=paddle.data_type.integer_value_sequence(src_dict_size)) + src_pos = paddle.layer.data( + name='src_word_pos', + type=paddle.data_type.integer_value_sequence(pos_size + + 1)) # one for padding + + src_emb = paddle.layer.embedding( + input=src, + size=emb_dim, + name='src_word_emb', + param_attr=paddle.attr.Param(initial_mean=0., initial_std=0.1)) + src_pos_emb = paddle.layer.embedding( + input=src_pos, + size=emb_dim, + name='src_pos_emb', + param_attr=paddle.attr.Param(initial_mean=0., initial_std=0.1)) + + num_attention = len(dec_conv_blocks) + encoded_vec, encoded_sum = encoder( + token_emb=src_emb, + pos_emb=src_pos_emb, + conv_blocks=enc_conv_blocks, + num_attention=num_attention, + drop_rate=drop_rate) + + trg = paddle.layer.data( + name='trg_word', + type=paddle.data_type.integer_value_sequence(trg_dict_size + + 1)) # one for padding + trg_pos = paddle.layer.data( + name='trg_word_pos', + type=paddle.data_type.integer_value_sequence(pos_size + + 1)) # one for padding + + trg_emb = paddle.layer.embedding( + input=trg, + size=emb_dim, + name='trg_word_emb', + param_attr=paddle.attr.Param(initial_mean=0., initial_std=0.1)) + trg_pos_emb = paddle.layer.embedding( + input=trg_pos, + size=emb_dim, + name='trg_pos_emb', + param_attr=paddle.attr.Param(initial_mean=0., initial_std=0.1)) + + decoder_out = decoder( + token_emb=trg_emb, + pos_emb=trg_pos_emb, + encoded_vec=encoded_vec, + encoded_sum=encoded_sum, + dict_size=trg_dict_size, + conv_blocks=dec_conv_blocks, + drop_rate=drop_rate) + + if is_infer: + return decoder_out + + trg_next_word = paddle.layer.data( + name='trg_next_word', + type=paddle.data_type.integer_value_sequence(trg_dict_size)) + cost = paddle.layer.classification_cost( + input=decoder_out, label=trg_next_word) + + return cost diff --git a/conv_seq2seq/reader.py b/conv_seq2seq/reader.py new file mode 100644 index 0000000000..6d4db49f2d --- /dev/null +++ b/conv_seq2seq/reader.py @@ -0,0 +1,67 @@ +#coding=utf-8 + +import random + + +def load_dict(dict_file): + word_dict = dict() + with open(dict_file, 'r') as f: + for i, line in enumerate(f): + w = line.strip().split()[0] + word_dict[w] = i + return word_dict + + +def get_reverse_dict(dictionary): + reverse_dict = {dictionary[k]: k for k in dictionary.keys()} + return reverse_dict + + +def load_data(data_file, src_dict, trg_dict): + UNK_IDX = src_dict[''] + with open(data_file, 'r') as f: + for line in f: + line_split = line.strip().split('\t') + if len(line_split) < 2: + continue + src, trg = line_split + src_words = src.strip().split() + trg_words = trg.strip().split() + src_seq = [src_dict.get(w, UNK_IDX) for w in src_words] + trg_seq = [trg_dict.get(w, UNK_IDX) for w in trg_words] + yield src_seq, trg_seq + + +def data_reader(data_file, src_dict, trg_dict, pos_size, padding_num): + def reader(): + UNK_IDX = src_dict[''] + word_padding = trg_dict.__len__() + pos_padding = pos_size + + def _get_pos(pos_list, pos_size, pos_padding): + return [pos if pos < pos_size else pos_padding for pos in pos_list] + + with open(data_file, 'r') as f: + for line in f: + line_split = line.strip().split('\t') + if len(line_split) != 2: + continue + src, trg = line_split + src = src.strip().split() + src_word = [src_dict.get(w, UNK_IDX) for w in src] + src_word_pos = range(len(src_word)) + src_word_pos = _get_pos(src_word_pos, pos_size, pos_padding) + + trg = trg.strip().split() + trg_word = [trg_dict[''] + ] + [trg_dict.get(w, UNK_IDX) for w in trg] + trg_word_pos = range(len(trg_word)) + trg_word_pos = _get_pos(trg_word_pos, pos_size, pos_padding) + + trg_next_word = trg_word[1:] + [trg_dict['']] + trg_word = [word_padding] * padding_num + trg_word + trg_word_pos = [pos_padding] * padding_num + trg_word_pos + trg_next_word = trg_next_word + [trg_dict['']] * padding_num + yield src_word, src_word_pos, trg_word, trg_word_pos, trg_next_word + + return reader diff --git a/conv_seq2seq/train.py b/conv_seq2seq/train.py new file mode 100644 index 0000000000..c6ce0dff12 --- /dev/null +++ b/conv_seq2seq/train.py @@ -0,0 +1,252 @@ +#coding=utf-8 + +import os +import sys +import time +import argparse +import distutils.util +import gzip +import numpy as np + +import paddle.v2 as paddle +from model import conv_seq2seq +import reader + + +def parse_args(): + parser = argparse.ArgumentParser( + description="PaddlePaddle Convolutional Seq2Seq") + parser.add_argument( + '--train_data_path', + type=str, + required=True, + help="Path of the training set") + parser.add_argument( + '--test_data_path', type=str, help='Path of the test set') + parser.add_argument( + '--src_dict_path', + type=str, + required=True, + help='Path of source dictionary') + parser.add_argument( + '--trg_dict_path', + type=str, + required=True, + help='Path of target dictionary') + parser.add_argument( + '--enc_blocks', type=str, help='Convolution blocks of the encoder') + parser.add_argument( + '--dec_blocks', type=str, help='Convolution blocks of the decoder') + parser.add_argument( + '--emb_size', + type=int, + default=512, + help='Dimension of word embedding. (default: %(default)s)') + parser.add_argument( + '--pos_size', + type=int, + default=200, + help='Total number of the position indexes. (default: %(default)s)') + parser.add_argument( + '--drop_rate', + type=float, + default=0., + help='Dropout rate. (default: %(default)s)') + parser.add_argument( + "--use_gpu", + default=False, + type=distutils.util.strtobool, + help="Use gpu or not. (default: %(default)s)") + parser.add_argument( + "--trainer_count", + default=1, + type=int, + help="Trainer number. (default: %(default)s)") + parser.add_argument( + '--batch_size', + type=int, + default=32, + help="Size of a mini-batch. (default: %(default)s)") + parser.add_argument( + '--num_passes', + type=int, + default=15, + help="Number of passes to train. (default: %(default)s)") + return parser.parse_args() + + +def create_reader(padding_num, + train_data_path, + test_data_path=None, + src_dict=None, + trg_dict=None, + pos_size=200, + batch_size=32): + + train_reader = paddle.batch( + reader=paddle.reader.shuffle( + reader=reader.data_reader( + data_file=train_data_path, + src_dict=src_dict, + trg_dict=trg_dict, + pos_size=pos_size, + padding_num=padding_num), + buf_size=10240), + batch_size=batch_size) + + test_reader = None + if test_data_path: + test_reader = paddle.batch( + reader=paddle.reader.shuffle( + reader=reader.data_reader( + data_file=test_data_path, + src_dict=src_dict, + trg_dict=trg_dict, + pos_size=pos_size, + padding_num=padding_num), + buf_size=10240), + batch_size=batch_size) + + return train_reader, test_reader + + +def train(train_data_path, + test_data_path, + src_dict_path, + trg_dict_path, + enc_conv_blocks, + dec_conv_blocks, + emb_dim=512, + pos_size=200, + drop_rate=0., + batch_size=32, + num_passes=15): + """ + Train the convolution sequence-to-sequence model. + + :param train_data_path: The path of the training set. + :type train_data_path: str + :param test_data_path: The path of the test set. + :type test_data_path: str + :param src_dict_path: The path of the source dictionary. + :type src_dict_path: str + :param trg_dict_path: The path of the target dictionary. + :type trg_dict_path: str + :param enc_conv_blocks: The scale list of the encoder's convolution blocks. And each element of + the list contains output dimension and context length of the corresponding + convolution block. + :type enc_conv_blocks: list of tuple + :param dec_conv_blocks: The scale list of the decoder's convolution blocks. And each element of + the list contains output dimension and context length of the corresponding + convolution block. + :type dec_conv_blocks: list of tuple + :param emb_dim: The dimension of the embedding vector. + :type emb_dim: int + :param pos_size: The total number of the position indexes, which means + the maximum value of the index is pos_size - 1. + :type pos_size: int + :param drop_rate: Dropout rate. + :type drop_rate: float + :param batch_size: The size of a mini-batch. + :type batch_size: int + :param num_passes: The total number of the passes to train. + :type num_passes: int + """ + # load dict + src_dict = reader.load_dict(src_dict_path) + trg_dict = reader.load_dict(trg_dict_path) + src_dict_size = src_dict.__len__() + trg_dict_size = trg_dict.__len__() + + optimizer = paddle.optimizer.Adam( + learning_rate=1e-3, ) + + cost = conv_seq2seq( + src_dict_size=src_dict_size, + trg_dict_size=trg_dict_size, + pos_size=pos_size, + emb_dim=emb_dim, + enc_conv_blocks=enc_conv_blocks, + dec_conv_blocks=dec_conv_blocks, + drop_rate=drop_rate, + is_infer=False) + + # create parameters and trainer + parameters = paddle.parameters.create(cost) + trainer = paddle.trainer.SGD( + cost=cost, parameters=parameters, update_equation=optimizer) + + padding_list = [context_len - 1 for (size, context_len) in dec_conv_blocks] + padding_num = reduce(lambda x, y: x + y, padding_list) + train_reader, test_reader = create_reader( + padding_num=padding_num, + train_data_path=train_data_path, + test_data_path=test_data_path, + src_dict=src_dict, + trg_dict=trg_dict, + pos_size=pos_size, + batch_size=batch_size) + + feeding = { + 'src_word': 0, + 'src_word_pos': 1, + 'trg_word': 2, + 'trg_word_pos': 3, + 'trg_next_word': 4 + } + + # create event handler + def event_handler(event): + if isinstance(event, paddle.event.EndIteration): + if event.batch_id % 20 == 0: + cur_time = time.strftime('%Y.%m.%d %H:%M:%S', time.localtime()) + print "[%s]: Pass: %d, Batch: %d, TrainCost: %f, %s" % ( + cur_time, event.pass_id, event.batch_id, event.cost, + event.metrics) + else: + sys.stdout.flush() + + if isinstance(event, paddle.event.EndPass): + if test_reader is not None: + cur_time = time.strftime('%Y.%m.%d %H:%M:%S', time.localtime()) + result = trainer.test(reader=test_reader, feeding=feeding) + print "[%s]: Pass: %d, TestCost: %f, %s" % ( + cur_time, event.pass_id, result.cost, result.metrics) + sys.stdout.flush() + with gzip.open("output/params.pass-%d.tar.gz" % event.pass_id, + 'w') as f: + trainer.save_parameter_to_tar(f) + + if not os.path.exists('output'): + os.mkdir('output') + + trainer.train( + reader=train_reader, + event_handler=event_handler, + num_passes=num_passes, + feeding=feeding) + + +def main(): + args = parse_args() + enc_conv_blocks = eval(args.enc_blocks) + dec_conv_blocks = eval(args.dec_blocks) + + paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count) + + train( + train_data_path=args.train_data_path, + test_data_path=args.test_data_path, + src_dict_path=args.src_dict_path, + trg_dict_path=args.trg_dict_path, + enc_conv_blocks=enc_conv_blocks, + dec_conv_blocks=dec_conv_blocks, + emb_dim=args.emb_size, + pos_size=args.pos_size, + drop_rate=args.drop_rate, + batch_size=args.batch_size, + num_passes=args.num_passes) + + +if __name__ == '__main__': + main() diff --git a/conv_seq_to_seq/README.md b/conv_seq_to_seq/README.md new file mode 100644 index 0000000000..817c464a3a --- /dev/null +++ b/conv_seq_to_seq/README.md @@ -0,0 +1,50 @@ +# Convolutional Sequence to Sequence Learning +This model implements the work in the following paper: + +Jonas Gehring, Micheal Auli, David Grangier, et al. Convolutional Sequence to Sequence Learning. Association for Computational Linguistics (ACL), 2017 + +# Training a Model +- Modify the following script if needed and then run: + + ```bash + python train.py \ + --train_data_path ./data/train_data \ + --test_data_path ./data/test_data \ + --src_dict_path ./data/src_dict \ + --trg_dict_path ./data/trg_dict \ + --enc_blocks "[(256, 3)] * 5" \ + --dec_blocks "[(256, 3)] * 3" \ + --emb_size 256 \ + --pos_size 200 \ + --drop_rate 0.1 \ + --use_gpu False \ + --trainer_count 1 \ + --batch_size 32 \ + --num_passes 20 \ + >train.log 2>&1 + ``` + +# Inferring by a Trained Model +- Infer by a trained model by running: + + ```bash + python infer.py \ + --infer_data_path ./data/infer_data \ + --src_dict_path ./data/src_dict \ + --trg_dict_path ./data/trg_dict \ + --enc_blocks "[(256, 3)] * 5" \ + --dec_blocks "[(256, 3)] * 3" \ + --emb_size 256 \ + --pos_size 200 \ + --drop_rate 0.1 \ + --use_gpu False \ + --trainer_count 1 \ + --max_len 100 \ + --beam_size 1 \ + --model_path ./params.pass-0.tar.gz \ + 1>infer_result 2>infer.log + ``` + +# Notes + +Currently, beam search will forward the encoder multiple times when predicting each target word, which requires extra computations. And we will fix it later. diff --git a/conv_seq_to_seq/beamsearch.py b/conv_seq_to_seq/beamsearch.py new file mode 100644 index 0000000000..45656e809b --- /dev/null +++ b/conv_seq_to_seq/beamsearch.py @@ -0,0 +1,163 @@ +#coding=utf-8 + +import sys +import time +import numpy as np + + +class BeamSearch(object): + """ + Generate sequence by beam search + NOTE: this class only implements generating one sentence at a time. + """ + + def __init__(self, + inferer, + trg_dict, + pos_size, + padding_num, + beam_size=1, + max_len=100): + self.inferer = inferer + self.trg_dict = trg_dict + self.word_padding = trg_dict.__len__() + self.pos_size = pos_size + self.pos_padding = pos_size + self.padding_num = padding_num + self.win_len = padding_num + 1 + self.max_len = max_len + self.beam_size = beam_size + + def get_beam_input(self, pre_beam_list, infer_data): + """ + Get input for generation at the current iteration. + """ + beam_input = [] + + if len(pre_beam_list) == 0: + cur_trg = [self.word_padding + ] * self.padding_num + [self.trg_dict['']] + cur_trg_pos = [self.pos_padding] * self.padding_num + [0] + beam_input.append(infer_data + [cur_trg] + [cur_trg_pos]) + else: + for seq in pre_beam_list: + if len(seq) < self.win_len: + cur_trg = [self.word_padding] * ( + self.win_len - len(seq) - 1 + ) + [self.trg_dict['']] + seq + cur_trg_pos = [self.pos_padding] * ( + self.win_len - len(seq) - 1) + [0] + range(1, + len(seq) + 1) + else: + cur_trg = seq[-self.win_len:] + cur_trg_pos = range( + len(seq) + 1 - self.win_len, len(seq) + 1) + + beam_input.append(infer_data + [cur_trg] + [cur_trg_pos]) + return beam_input + + def get_prob(self, beam_input): + """ + Get the probabilities of all possible tokens. + """ + row_list = [j * self.win_len for j in range(len(beam_input))] + prob = self.inferer.infer(beam_input, field='value')[row_list, :] + return prob + + def get_candidate(self, pre_beam_list, pre_beam_score, prob): + """ + Get top beam_size tokens and their scores for each beam. + """ + if prob.ndim == 1: + candidate_id = prob.argsort()[-self.beam_size:][::-1] + candidate_log_prob = np.log(prob[candidate_id]) + else: + candidate_id = prob.argsort()[:, -self.beam_size:][:, ::-1] + candidate_log_prob = np.zeros_like(candidate_id).astype('float32') + for j in range(len(pre_beam_list)): + candidate_log_prob[j, :] = np.log(prob[j, candidate_id[j, :]]) + + if pre_beam_score.size > 0: + candidate_score = candidate_log_prob + pre_beam_score.reshape( + (pre_beam_score.size, 1)) + else: + candidate_score = candidate_log_prob + + return candidate_id, candidate_score + + def prune(self, candidate_id, candidate_score, pre_beam_list, + completed_seq_list, completed_seq_score, completed_seq_min_score): + """ + Pruning process of the beam search. During the process, beam_size most possible sequences + are selected for the beam in the next iteration. Besides, their scores and the minimum score + of the completed sequences are updated. + """ + candidate_id = candidate_id.flatten() + candidate_score = candidate_score.flatten() + + topk_idx = candidate_score.argsort()[-self.beam_size:][::-1].tolist() + topk_seq_idx = [idx / self.beam_size for idx in topk_idx] + + next_beam = [] + beam_score = [] + for j in range(len(topk_idx)): + if candidate_id[topk_idx[j]] == self.trg_dict['']: + if len( + completed_seq_list + ) < self.beam_size or completed_seq_min_score <= candidate_score[ + topk_idx[j]]: + completed_seq_list.append(pre_beam_list[topk_seq_idx[j]]) + completed_seq_score.append(candidate_score[topk_idx[j]]) + + if completed_seq_min_score is None or ( + completed_seq_min_score >= + candidate_score[topk_idx[j]] and + len(completed_seq_list) < self.beam_size): + completed_seq_min_score = candidate_score[topk_idx[j]] + else: + seq = pre_beam_list[topk_seq_idx[ + j]] + [candidate_id[topk_idx[j]]] + score = candidate_score[topk_idx[j]] + next_beam.append(seq) + beam_score.append(score) + + beam_score = np.array(beam_score) + return next_beam, beam_score, completed_seq_min_score + + def search_one_sample(self, infer_data): + """ + Beam search process for one sample. + """ + completed_seq_list = [] + completed_seq_score = [] + completed_seq_min_score = None + uncompleted_seq_list = [[]] + uncompleted_seq_score = np.zeros(0) + + for i in xrange(self.max_len): + beam_input = self.get_beam_input(uncompleted_seq_list, infer_data) + + prob = self.get_prob(beam_input) + + candidate_id, candidate_score = self.get_candidate( + uncompleted_seq_list, uncompleted_seq_score, prob) + + uncompleted_seq_list, uncompleted_seq_score, completed_seq_min_score = self.prune( + candidate_id, candidate_score, uncompleted_seq_list, + completed_seq_list, completed_seq_score, + completed_seq_min_score) + + if len(uncompleted_seq_list) == 0: + break + if len(completed_seq_list) >= self.beam_size: + seq_max_score = uncompleted_seq_score.max() + if seq_max_score < completed_seq_min_score: + uncompleted_seq_list = [] + break + + final_seq_list = completed_seq_list + uncompleted_seq_list + final_score = np.concatenate( + (np.array(completed_seq_score), uncompleted_seq_score)) + max_id = final_score.argmax() + top_seq = final_seq_list[max_id] + return top_seq diff --git a/conv_seq_to_seq/infer.py b/conv_seq_to_seq/infer.py new file mode 100644 index 0000000000..eb46df5549 --- /dev/null +++ b/conv_seq_to_seq/infer.py @@ -0,0 +1,199 @@ +#coding=utf-8 + +import sys +import argparse +import distutils.util +import gzip + +import paddle.v2 as paddle +from model import conv_seq2seq +from beamsearch import BeamSearch +import reader + + +def parse_args(): + parser = argparse.ArgumentParser( + description="PaddlePaddle Convolutional Seq2Seq") + parser.add_argument( + '--infer_data_path', + type=str, + required=True, + help="Path of the dataset for inference") + parser.add_argument( + '--src_dict_path', + type=str, + required=True, + help='Path of the source dictionary') + parser.add_argument( + '--trg_dict_path', + type=str, + required=True, + help='path of the target dictionary') + parser.add_argument( + '--enc_blocks', type=str, help='Convolution blocks of the encoder') + parser.add_argument( + '--dec_blocks', type=str, help='Convolution blocks of the decoder') + parser.add_argument( + '--emb_size', + type=int, + default=512, + help='Dimension of word embedding. (default: %(default)s)') + parser.add_argument( + '--pos_size', + type=int, + default=200, + help='Total number of the position indexes. (default: %(default)s)') + parser.add_argument( + '--drop_rate', + type=float, + default=0., + help='Dropout rate. (default: %(default)s)') + parser.add_argument( + "--use_gpu", + default=False, + type=distutils.util.strtobool, + help="Use gpu or not. (default: %(default)s)") + parser.add_argument( + "--trainer_count", + default=1, + type=int, + help="Trainer number. (default: %(default)s)") + parser.add_argument( + '--max_len', + type=int, + default=100, + help="The maximum length of the sentence to be generated. (default: %(default)s)" + ) + parser.add_argument( + "--beam_size", + default=1, + type=int, + help="The width of beam expasion. (default: %(default)s)") + parser.add_argument( + "--model_path", + type=str, + required=True, + help="The path of trained model. (default: %(default)s)") + return parser.parse_args() + + +def to_sentence(seq, dictionary): + raw_sentence = [dictionary[id] for id in seq] + sentence = " ".join(raw_sentence) + return sentence + + +def infer(infer_data_path, + src_dict_path, + trg_dict_path, + model_path, + enc_conv_blocks, + dec_conv_blocks, + emb_dim=512, + pos_size=200, + drop_rate=0., + max_len=100, + beam_size=1): + """ + Inference. + + :param infer_data_path: The path of the data for inference. + :type infer_data_path: str + :param src_dict_path: The path of the source dictionary. + :type src_dict_path: str + :param trg_dict_path: The path of the target dictionary. + :type trg_dict_path: str + :param model_path: The path of a trained model. + :type model_path: str + :param enc_conv_blocks: The scale list of the encoder's convolution blocks. And each element of + the list contains output dimension and context length of the corresponding + convolution block. + :type enc_conv_blocks: list of tuple + :param dec_conv_blocks: The scale list of the decoder's convolution blocks. And each element of + the list contains output dimension and context length of the corresponding + convolution block. + :type dec_conv_blocks: list of tuple + :param emb_dim: The dimension of the embedding vector. + :type emb_dim: int + :param pos_size: The total number of the position indexes, which means + the maximum value of the index is pos_size - 1. + :type pos_size: int + :param drop_rate: Dropout rate. + :type drop_rate: float + :param max_len: The maximum length of the sentence to be generated. + :type max_len: int + :param beam_size: The width of beam expansion. + :type beam_size: int + """ + # load dict + src_dict = reader.load_dict(src_dict_path) + trg_dict = reader.load_dict(trg_dict_path) + src_dict_size = src_dict.__len__() + trg_dict_size = trg_dict.__len__() + + prob = conv_seq2seq( + src_dict_size=src_dict_size, + trg_dict_size=trg_dict_size, + pos_size=pos_size, + emb_dim=emb_dim, + enc_conv_blocks=enc_conv_blocks, + dec_conv_blocks=dec_conv_blocks, + drop_rate=drop_rate, + is_infer=True) + + # load parameters + parameters = paddle.parameters.Parameters.from_tar(gzip.open(model_path)) + + padding_list = [context_len - 1 for (size, context_len) in dec_conv_blocks] + padding_num = reduce(lambda x, y: x + y, padding_list) + infer_reader = reader.data_reader( + data_file=infer_data_path, + src_dict=src_dict, + trg_dict=trg_dict, + pos_size=pos_size, + padding_num=padding_num) + + inferer = paddle.inference.Inference( + output_layer=prob, parameters=parameters) + + searcher = BeamSearch( + inferer=inferer, + trg_dict=trg_dict, + pos_size=pos_size, + padding_num=padding_num, + max_len=max_len, + beam_size=beam_size) + + reverse_trg_dict = reader.get_reverse_dict(trg_dict) + for i, raw_data in enumerate(infer_reader()): + infer_data = [raw_data[0], raw_data[1]] + result = searcher.search_one_sample(infer_data) + sentence = to_sentence(result, reverse_trg_dict) + print sentence + sys.stdout.flush() + return + + +def main(): + args = parse_args() + enc_conv_blocks = eval(args.enc_blocks) + dec_conv_blocks = eval(args.dec_blocks) + + paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count) + + infer( + infer_data_path=args.infer_data_path, + src_dict_path=args.src_dict_path, + trg_dict_path=args.trg_dict_path, + model_path=args.model_path, + enc_conv_blocks=enc_conv_blocks, + dec_conv_blocks=dec_conv_blocks, + emb_dim=args.emb_size, + pos_size=args.pos_size, + drop_rate=args.drop_rate, + max_len=args.max_len, + beam_size=args.beam_size) + + +if __name__ == '__main__': + main() diff --git a/conv_seq_to_seq/model.py b/conv_seq_to_seq/model.py new file mode 100644 index 0000000000..01dd94288b --- /dev/null +++ b/conv_seq_to_seq/model.py @@ -0,0 +1,417 @@ +#coding=utf-8 + +import math + +import paddle.v2 as paddle + +__all__ = ["conv_seq2seq"] + + +def gated_conv_with_batchnorm(input, + size, + context_len, + context_start=None, + learning_rate=1.0, + drop_rate=0.): + """ + Definition of the convolution block. + + :param input: The input of this block. + :type input: LayerOutput + :param size: The dimension of the block's output. + :type size: int + :param context_len: The context length of the convolution. + :type context_len: int + :param context_start: The start position of the context. + :type context_start: int + :param learning_rate: The learning rate factor of the parameters in the block. + The actual learning rate is the product of the global + learning rate and this factor. + :type learning_rate: float + :param drop_rate: Dropout rate. + :type drop_rate: float + :return: The output of the convolution block. + :rtype: LayerOutput + """ + input = paddle.layer.dropout(input=input, dropout_rate=drop_rate) + + context = paddle.layer.mixed( + size=input.size * context_len, + input=paddle.layer.context_projection( + input=input, context_len=context_len, context_start=context_start)) + + raw_conv = paddle.layer.fc( + input=context, + size=size * 2, + act=paddle.activation.Linear(), + param_attr=paddle.attr.Param( + initial_mean=0., + initial_std=math.sqrt(4.0 * (1.0 - drop_rate) / context.size), + learning_rate=learning_rate), + bias_attr=False) + + batch_norm_conv = paddle.layer.batch_norm( + input=raw_conv, + act=paddle.activation.Linear(), + param_attr=paddle.attr.Param(learning_rate=learning_rate)) + + with paddle.layer.mixed(size=size) as conv: + conv += paddle.layer.identity_projection( + batch_norm_conv, size=size, offset=0) + + with paddle.layer.mixed(size=size, act=paddle.activation.Sigmoid()) as gate: + gate += paddle.layer.identity_projection( + batch_norm_conv, size=size, offset=size) + + with paddle.layer.mixed(size=size) as gated_conv: + gated_conv += paddle.layer.dotmul_operator(conv, gate) + + return gated_conv + + +def encoder(token_emb, + pos_emb, + conv_blocks=[(256, 3)] * 5, + num_attention=3, + drop_rate=0.1): + """ + Definition of the encoder. + + :param token_emb: The embedding vector of the input token. + :type token_emb: LayerOutput + :param pos_emb: The embedding vector of the input token's position. + :type pos_emb: LayerOutput + :param conv_blocks: The scale list of the convolution blocks. Each element of + the list contains output dimension and context length of + the corresponding convolution block. + :type conv_blocks: list of tuple + :param num_attention: The total number of the attention modules used in the decoder. + :type num_attention: int + :param drop_rate: Dropout rate. + :type drop_rate: float + :return: The input token encoding. + :rtype: LayerOutput + """ + embedding = paddle.layer.addto( + input=[token_emb, pos_emb], + layer_attr=paddle.attr.Extra(drop_rate=drop_rate)) + + proj_size = conv_blocks[0][0] + block_input = paddle.layer.fc( + input=embedding, + size=proj_size, + act=paddle.activation.Linear(), + param_attr=paddle.attr.Param( + initial_mean=0., + initial_std=math.sqrt((1.0 - drop_rate) / embedding.size), + learning_rate=1.0 / (2.0 * num_attention)), + bias_attr=True, ) + + for (size, context_len) in conv_blocks: + if block_input.size == size: + residual = block_input + else: + residual = paddle.layer.fc( + input=block_input, + size=size, + act=paddle.activation.Linear(), + param_attr=paddle.attr.Param(learning_rate=1.0 / + (2.0 * num_attention)), + bias_attr=True) + + gated_conv = gated_conv_with_batchnorm( + input=block_input, + size=size, + context_len=context_len, + learning_rate=1.0 / (2.0 * num_attention), + drop_rate=drop_rate) + + with paddle.layer.mixed(size=size) as block_output: + block_output += paddle.layer.identity_projection(residual) + block_output += paddle.layer.identity_projection(gated_conv) + + # halve the variance of the sum + block_output = paddle.layer.slope_intercept( + input=block_output, slope=math.sqrt(0.5)) + + block_input = block_output + + emb_dim = embedding.size + encoded_vec = paddle.layer.fc( + input=block_output, + size=emb_dim, + act=paddle.activation.Linear(), + param_attr=paddle.attr.Param(learning_rate=1.0 / (2.0 * num_attention)), + bias_attr=True) + + encoded_sum = paddle.layer.addto(input=[encoded_vec, embedding]) + + # halve the variance of the sum + encoded_sum = paddle.layer.slope_intercept(input=encoded_sum, slope=math.sqrt(0.5)) + + return encoded_vec, encoded_sum + + +def attention(decoder_state, cur_embedding, encoded_vec, encoded_sum): + """ + Definition of the attention. + + :param decoder_state: The hidden state of the decoder. + :type decoder_state: LayerOutput + :param cur_embedding: The embedding vector of the current token. + :type cur_embedding: LayerOutput + :param encoded_vec: The source token encoding. + :type encoded_vec: LayerOutput + :param encoded_sum: The sum of the source token's encoding and embedding. + :type encoded_sum: LayerOutput + :return: A context vector. + :rtype: LayerOutput + """ + residual = decoder_state + + state_size = decoder_state.size + emb_dim = cur_embedding.size + with paddle.layer.mixed(size=emb_dim, bias_attr=True) as state_summary: + state_summary += paddle.layer.full_matrix_projection(decoder_state) + state_summary += paddle.layer.identity_projection(cur_embedding) + + # halve the variance of the sum + state_summary = paddle.layer.slope_intercept( + input=state_summary, slope=math.sqrt(0.5)) + + expanded = paddle.layer.expand(input=state_summary, expand_as=encoded_vec) + + m = paddle.layer.linear_comb(weights=expanded, vectors=encoded_vec) + + attention_weight = paddle.layer.fc( + input=m, + size=1, + act=paddle.activation.SequenceSoftmax(), + bias_attr=False) + + scaled = paddle.layer.scaling(weight=attention_weight, input=encoded_sum) + + attended = paddle.layer.pooling( + input=scaled, pooling_type=paddle.pooling.Sum()) + + attended_proj = paddle.layer.fc( + input=attended, + size=state_size, + act=paddle.activation.Linear(), + bias_attr=True) + + attention_result = paddle.layer.addto(input=[attended_proj, residual]) + + # halve the variance of the sum + attention_result = paddle.layer.slope_intercept( + input=attention_result, slope=math.sqrt(0.5)) + return attention_result + + +def decoder(token_emb, + pos_emb, + encoded_vec, + encoded_sum, + dict_size, + conv_blocks=[(256, 3)] * 3, + drop_rate=0.1): + """ + Definition of the decoder. + + :param token_emb: The embedding vector of the input token. + :type token_emb: LayerOutput + :param pos_emb: The embedding vector of the input token's position. + :type pos_emb: LayerOutput + :param encoded_vec: The source token encoding. + :type encoded_vec: LayerOutput + :param encoded_sum: The sum of the source token's encoding and embedding. + :type encoded_sum: LayerOutput + :param dict_size: The size of the target dictionary. + :type dict_size: int + :param conv_blocks: The scale list of the convolution blocks. Each element + of the list contains output dimension and context length + of the corresponding convolution block. + :type conv_blocks: list of tuple + :param drop_rate: Dropout rate. + :type drop_rate: float + :return: The probability of the predicted token. + :rtype: LayerOutput + """ + + def attention_step(decoder_state, cur_embedding, encoded_vec, encoded_sum): + conditional = attention( + decoder_state=decoder_state, + cur_embedding=cur_embedding, + encoded_vec=encoded_vec, + encoded_sum=encoded_sum) + return conditional + + embedding = paddle.layer.addto( + input=[token_emb, pos_emb], + layer_attr=paddle.attr.Extra(drop_rate=drop_rate)) + + proj_size = conv_blocks[0][0] + block_input = paddle.layer.fc( + input=embedding, + size=proj_size, + act=paddle.activation.Linear(), + param_attr=paddle.attr.Param( + initial_mean=0., + initial_std=math.sqrt((1.0 - drop_rate) / embedding.size)), + bias_attr=True, ) + + for (size, context_len) in conv_blocks: + if block_input.size == size: + residual = block_input + else: + residual = paddle.layer.fc( + input=block_input, + size=size, + act=paddle.activation.Linear(), + bias_attr=True) + + decoder_state = gated_conv_with_batchnorm( + input=block_input, + size=size, + context_len=context_len, + context_start=0, + drop_rate=drop_rate) + + group_inputs = [ + decoder_state, + embedding, + paddle.layer.StaticInput(input=encoded_vec), + paddle.layer.StaticInput(input=encoded_sum), + ] + + conditional = paddle.layer.recurrent_group( + step=attention_step, input=group_inputs) + + block_output = paddle.layer.addto(input=[conditional, residual]) + + # halve the variance of the sum + block_output = paddle.layer.slope_intercept( + input=block_output, slope=math.sqrt(0.5)) + + block_input = block_output + + out_emb_dim = embedding.size + block_output = paddle.layer.fc( + input=block_output, + size=out_emb_dim, + act=paddle.activation.Linear(), + layer_attr=paddle.attr.Extra(drop_rate=drop_rate)) + + decoder_out = paddle.layer.fc( + input=block_output, + size=dict_size, + act=paddle.activation.Softmax(), + param_attr=paddle.attr.Param( + initial_mean=0., + initial_std=math.sqrt((1.0 - drop_rate) / block_output.size)), + bias_attr=True) + + return decoder_out + + +def conv_seq2seq(src_dict_size, + trg_dict_size, + pos_size, + emb_dim, + enc_conv_blocks=[(256, 3)] * 5, + dec_conv_blocks=[(256, 3)] * 3, + drop_rate=0.1, + is_infer=False): + """ + Definition of convolutional sequence-to-sequence network. + + :param src_dict_size: The size of the source dictionary. + :type src_dict_size: int + :param trg_dict_size: The size of the target dictionary. + :type trg_dict_size: int + :param pos_size: The total number of the position indexes, which means + the maximum value of the index is pos_size - 1. + :type pos_size: int + :param emb_dim: The dimension of the embedding vector. + :type emb_dim: int + :param enc_conv_blocks: The scale list of the encoder's convolution blocks. Each element + of the list contains output dimension and context length of the + corresponding convolution block. + :type enc_conv_blocks: list of tuple + :param dec_conv_blocks: The scale list of the decoder's convolution blocks. Each element + of the list contains output dimension and context length of the + corresponding convolution block. + :type dec_conv_blocks: list of tuple + :param drop_rate: Dropout rate. + :type drop_rate: float + :param is_infer: Whether infer or not. + :type is_infer: bool + :return: Cost or output layer. + :rtype: LayerOutput + """ + src = paddle.layer.data( + name='src_word', + type=paddle.data_type.integer_value_sequence(src_dict_size)) + src_pos = paddle.layer.data( + name='src_word_pos', + type=paddle.data_type.integer_value_sequence(pos_size + + 1)) # one for padding + + src_emb = paddle.layer.embedding( + input=src, + size=emb_dim, + name='src_word_emb', + param_attr=paddle.attr.Param(initial_mean=0., initial_std=0.1)) + src_pos_emb = paddle.layer.embedding( + input=src_pos, + size=emb_dim, + name='src_pos_emb', + param_attr=paddle.attr.Param(initial_mean=0., initial_std=0.1)) + + num_attention = len(dec_conv_blocks) + encoded_vec, encoded_sum = encoder( + token_emb=src_emb, + pos_emb=src_pos_emb, + conv_blocks=enc_conv_blocks, + num_attention=num_attention, + drop_rate=drop_rate) + + trg = paddle.layer.data( + name='trg_word', + type=paddle.data_type.integer_value_sequence(trg_dict_size + + 1)) # one for padding + trg_pos = paddle.layer.data( + name='trg_word_pos', + type=paddle.data_type.integer_value_sequence(pos_size + + 1)) # one for padding + + trg_emb = paddle.layer.embedding( + input=trg, + size=emb_dim, + name='trg_word_emb', + param_attr=paddle.attr.Param(initial_mean=0., initial_std=0.1)) + trg_pos_emb = paddle.layer.embedding( + input=trg_pos, + size=emb_dim, + name='trg_pos_emb', + param_attr=paddle.attr.Param(initial_mean=0., initial_std=0.1)) + + decoder_out = decoder( + token_emb=trg_emb, + pos_emb=trg_pos_emb, + encoded_vec=encoded_vec, + encoded_sum=encoded_sum, + dict_size=trg_dict_size, + conv_blocks=dec_conv_blocks, + drop_rate=drop_rate) + + if is_infer: + return decoder_out + + trg_next_word = paddle.layer.data( + name='trg_next_word', + type=paddle.data_type.integer_value_sequence(trg_dict_size)) + cost = paddle.layer.classification_cost( + input=decoder_out, label=trg_next_word) + + return cost diff --git a/conv_seq_to_seq/reader.py b/conv_seq_to_seq/reader.py new file mode 100644 index 0000000000..6d4db49f2d --- /dev/null +++ b/conv_seq_to_seq/reader.py @@ -0,0 +1,67 @@ +#coding=utf-8 + +import random + + +def load_dict(dict_file): + word_dict = dict() + with open(dict_file, 'r') as f: + for i, line in enumerate(f): + w = line.strip().split()[0] + word_dict[w] = i + return word_dict + + +def get_reverse_dict(dictionary): + reverse_dict = {dictionary[k]: k for k in dictionary.keys()} + return reverse_dict + + +def load_data(data_file, src_dict, trg_dict): + UNK_IDX = src_dict[''] + with open(data_file, 'r') as f: + for line in f: + line_split = line.strip().split('\t') + if len(line_split) < 2: + continue + src, trg = line_split + src_words = src.strip().split() + trg_words = trg.strip().split() + src_seq = [src_dict.get(w, UNK_IDX) for w in src_words] + trg_seq = [trg_dict.get(w, UNK_IDX) for w in trg_words] + yield src_seq, trg_seq + + +def data_reader(data_file, src_dict, trg_dict, pos_size, padding_num): + def reader(): + UNK_IDX = src_dict[''] + word_padding = trg_dict.__len__() + pos_padding = pos_size + + def _get_pos(pos_list, pos_size, pos_padding): + return [pos if pos < pos_size else pos_padding for pos in pos_list] + + with open(data_file, 'r') as f: + for line in f: + line_split = line.strip().split('\t') + if len(line_split) != 2: + continue + src, trg = line_split + src = src.strip().split() + src_word = [src_dict.get(w, UNK_IDX) for w in src] + src_word_pos = range(len(src_word)) + src_word_pos = _get_pos(src_word_pos, pos_size, pos_padding) + + trg = trg.strip().split() + trg_word = [trg_dict[''] + ] + [trg_dict.get(w, UNK_IDX) for w in trg] + trg_word_pos = range(len(trg_word)) + trg_word_pos = _get_pos(trg_word_pos, pos_size, pos_padding) + + trg_next_word = trg_word[1:] + [trg_dict['']] + trg_word = [word_padding] * padding_num + trg_word + trg_word_pos = [pos_padding] * padding_num + trg_word_pos + trg_next_word = trg_next_word + [trg_dict['']] * padding_num + yield src_word, src_word_pos, trg_word, trg_word_pos, trg_next_word + + return reader diff --git a/conv_seq_to_seq/train.py b/conv_seq_to_seq/train.py new file mode 100644 index 0000000000..c6ce0dff12 --- /dev/null +++ b/conv_seq_to_seq/train.py @@ -0,0 +1,252 @@ +#coding=utf-8 + +import os +import sys +import time +import argparse +import distutils.util +import gzip +import numpy as np + +import paddle.v2 as paddle +from model import conv_seq2seq +import reader + + +def parse_args(): + parser = argparse.ArgumentParser( + description="PaddlePaddle Convolutional Seq2Seq") + parser.add_argument( + '--train_data_path', + type=str, + required=True, + help="Path of the training set") + parser.add_argument( + '--test_data_path', type=str, help='Path of the test set') + parser.add_argument( + '--src_dict_path', + type=str, + required=True, + help='Path of source dictionary') + parser.add_argument( + '--trg_dict_path', + type=str, + required=True, + help='Path of target dictionary') + parser.add_argument( + '--enc_blocks', type=str, help='Convolution blocks of the encoder') + parser.add_argument( + '--dec_blocks', type=str, help='Convolution blocks of the decoder') + parser.add_argument( + '--emb_size', + type=int, + default=512, + help='Dimension of word embedding. (default: %(default)s)') + parser.add_argument( + '--pos_size', + type=int, + default=200, + help='Total number of the position indexes. (default: %(default)s)') + parser.add_argument( + '--drop_rate', + type=float, + default=0., + help='Dropout rate. (default: %(default)s)') + parser.add_argument( + "--use_gpu", + default=False, + type=distutils.util.strtobool, + help="Use gpu or not. (default: %(default)s)") + parser.add_argument( + "--trainer_count", + default=1, + type=int, + help="Trainer number. (default: %(default)s)") + parser.add_argument( + '--batch_size', + type=int, + default=32, + help="Size of a mini-batch. (default: %(default)s)") + parser.add_argument( + '--num_passes', + type=int, + default=15, + help="Number of passes to train. (default: %(default)s)") + return parser.parse_args() + + +def create_reader(padding_num, + train_data_path, + test_data_path=None, + src_dict=None, + trg_dict=None, + pos_size=200, + batch_size=32): + + train_reader = paddle.batch( + reader=paddle.reader.shuffle( + reader=reader.data_reader( + data_file=train_data_path, + src_dict=src_dict, + trg_dict=trg_dict, + pos_size=pos_size, + padding_num=padding_num), + buf_size=10240), + batch_size=batch_size) + + test_reader = None + if test_data_path: + test_reader = paddle.batch( + reader=paddle.reader.shuffle( + reader=reader.data_reader( + data_file=test_data_path, + src_dict=src_dict, + trg_dict=trg_dict, + pos_size=pos_size, + padding_num=padding_num), + buf_size=10240), + batch_size=batch_size) + + return train_reader, test_reader + + +def train(train_data_path, + test_data_path, + src_dict_path, + trg_dict_path, + enc_conv_blocks, + dec_conv_blocks, + emb_dim=512, + pos_size=200, + drop_rate=0., + batch_size=32, + num_passes=15): + """ + Train the convolution sequence-to-sequence model. + + :param train_data_path: The path of the training set. + :type train_data_path: str + :param test_data_path: The path of the test set. + :type test_data_path: str + :param src_dict_path: The path of the source dictionary. + :type src_dict_path: str + :param trg_dict_path: The path of the target dictionary. + :type trg_dict_path: str + :param enc_conv_blocks: The scale list of the encoder's convolution blocks. And each element of + the list contains output dimension and context length of the corresponding + convolution block. + :type enc_conv_blocks: list of tuple + :param dec_conv_blocks: The scale list of the decoder's convolution blocks. And each element of + the list contains output dimension and context length of the corresponding + convolution block. + :type dec_conv_blocks: list of tuple + :param emb_dim: The dimension of the embedding vector. + :type emb_dim: int + :param pos_size: The total number of the position indexes, which means + the maximum value of the index is pos_size - 1. + :type pos_size: int + :param drop_rate: Dropout rate. + :type drop_rate: float + :param batch_size: The size of a mini-batch. + :type batch_size: int + :param num_passes: The total number of the passes to train. + :type num_passes: int + """ + # load dict + src_dict = reader.load_dict(src_dict_path) + trg_dict = reader.load_dict(trg_dict_path) + src_dict_size = src_dict.__len__() + trg_dict_size = trg_dict.__len__() + + optimizer = paddle.optimizer.Adam( + learning_rate=1e-3, ) + + cost = conv_seq2seq( + src_dict_size=src_dict_size, + trg_dict_size=trg_dict_size, + pos_size=pos_size, + emb_dim=emb_dim, + enc_conv_blocks=enc_conv_blocks, + dec_conv_blocks=dec_conv_blocks, + drop_rate=drop_rate, + is_infer=False) + + # create parameters and trainer + parameters = paddle.parameters.create(cost) + trainer = paddle.trainer.SGD( + cost=cost, parameters=parameters, update_equation=optimizer) + + padding_list = [context_len - 1 for (size, context_len) in dec_conv_blocks] + padding_num = reduce(lambda x, y: x + y, padding_list) + train_reader, test_reader = create_reader( + padding_num=padding_num, + train_data_path=train_data_path, + test_data_path=test_data_path, + src_dict=src_dict, + trg_dict=trg_dict, + pos_size=pos_size, + batch_size=batch_size) + + feeding = { + 'src_word': 0, + 'src_word_pos': 1, + 'trg_word': 2, + 'trg_word_pos': 3, + 'trg_next_word': 4 + } + + # create event handler + def event_handler(event): + if isinstance(event, paddle.event.EndIteration): + if event.batch_id % 20 == 0: + cur_time = time.strftime('%Y.%m.%d %H:%M:%S', time.localtime()) + print "[%s]: Pass: %d, Batch: %d, TrainCost: %f, %s" % ( + cur_time, event.pass_id, event.batch_id, event.cost, + event.metrics) + else: + sys.stdout.flush() + + if isinstance(event, paddle.event.EndPass): + if test_reader is not None: + cur_time = time.strftime('%Y.%m.%d %H:%M:%S', time.localtime()) + result = trainer.test(reader=test_reader, feeding=feeding) + print "[%s]: Pass: %d, TestCost: %f, %s" % ( + cur_time, event.pass_id, result.cost, result.metrics) + sys.stdout.flush() + with gzip.open("output/params.pass-%d.tar.gz" % event.pass_id, + 'w') as f: + trainer.save_parameter_to_tar(f) + + if not os.path.exists('output'): + os.mkdir('output') + + trainer.train( + reader=train_reader, + event_handler=event_handler, + num_passes=num_passes, + feeding=feeding) + + +def main(): + args = parse_args() + enc_conv_blocks = eval(args.enc_blocks) + dec_conv_blocks = eval(args.dec_blocks) + + paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count) + + train( + train_data_path=args.train_data_path, + test_data_path=args.test_data_path, + src_dict_path=args.src_dict_path, + trg_dict_path=args.trg_dict_path, + enc_conv_blocks=enc_conv_blocks, + dec_conv_blocks=dec_conv_blocks, + emb_dim=args.emb_size, + pos_size=args.pos_size, + drop_rate=args.drop_rate, + batch_size=args.batch_size, + num_passes=args.num_passes) + + +if __name__ == '__main__': + main()