From 799783e2249af7c7df91a6cf9e48bc457651ef64 Mon Sep 17 00:00:00 2001
From: Valentin Berkes
Date: Wed, 17 Feb 2021 09:47:43 +0100
Subject: [PATCH 1/5] implement safe bpe dropout for LM + joiner disjoin dropout

---
 onmt/transforms/misc.py     | 80 +++++++++++++++++++++++++++++++++++++
 onmt/transforms/tokenize.py | 69 +++++++++++++++++++++++---------
 2 files changed, 130 insertions(+), 19 deletions(-)

diff --git a/onmt/transforms/misc.py b/onmt/transforms/misc.py
index 3885eb97f0..d5635b9945 100644
--- a/onmt/transforms/misc.py
+++ b/onmt/transforms/misc.py
@@ -1,6 +1,8 @@
 from onmt.utils.logging import logger
 from onmt.transforms import register_transform
 from .transform import Transform
+import random
+from onmt.constants import ModelTask, SubwordMarker
 
 
 @register_transform(name='filtertoolong')
@@ -41,6 +43,84 @@ def _repr_args(self):
         )
 
 
+@register_transform(name="joiner_dropout")
+class JoinerDropoutTransform(Transform):
+    """Disjoin subword joiners with a given dropout probability."""
+
+    def __init__(self, opts):
+        super().__init__(opts)
+
+    @classmethod
+    def add_options(cls, parser):
+        """Available options relating to this Transform."""
+        group = parser.add_argument_group("Transform/JoinerDropout")
+        group.add(
+            "--src_joiner_dropout",
+            "-src_joiner_dropout",
+            type=float,
+            default=0.0,
+            help="Source dropout probability.",
+        )
+        group.add(
+            "--tgt_joiner_dropout",
+            "-tgt_joiner_dropout",
+            type=float,
+            default=0.0,
+            help="Target dropout probability.",
+        )
+
+    def _parse_opts(self):
+        self.src_joiner_dropout = self.opts.src_joiner_dropout
+        self.tgt_joiner_dropout = self.opts.tgt_joiner_dropout
+
+    def dropout_separate_joiner(self, seq, side="src"):
+        out_seq = []
+        dropout = (
+            self.src_joiner_dropout
+            if side == "src"
+            else self.tgt_joiner_dropout
+        )
+        for elem in seq:
+            if elem == SubwordMarker.JOINER:
+                continue
+            if elem.startswith(SubwordMarker.JOINER):
+                if random.random() < dropout:
+                    out_seq.append(SubwordMarker.JOINER)
+                    elem = elem[1:]
+            if len(elem) > 1 and elem.endswith(SubwordMarker.JOINER):
+                if random.random() < dropout:
+                    out_seq.append(elem[:-1])
+                    out_seq.append(SubwordMarker.JOINER)
+                else:
+                    out_seq.append(elem)
+            else:
+                out_seq.append(elem)
+        return out_seq
+
+    def apply(self, example, is_train=False, stats=None, **kwargs):
+        """Apply joiner disjoin dropout to src & tgt (training only)."""
+        if not is_train:
+            return example
+        else:
+            src_out = self.dropout_separate_joiner(example["src"], "src")
+            example["src"] = src_out
+            if self.opts.model_task == ModelTask.LANGUAGE_MODEL:
+                example["tgt"] = src_out
+            else:
+                tgt_out = self.dropout_separate_joiner(example["tgt"], "tgt")
+                example["tgt"] = tgt_out
+            return example
+
+    def _repr_args(self):
+        """Return str represent key arguments for class."""
+        return "{}={}, {}={}".format(
+            "src_joiner_dropout",
+            self.src_joiner_dropout,
+            "tgt_joiner_dropout",
+            self.tgt_joiner_dropout,
+        )
+
+
 @register_transform(name='prefix')
 class PrefixTransform(Transform):
     """Add Prefix to src (& tgt) sentence."""
diff --git a/onmt/transforms/tokenize.py b/onmt/transforms/tokenize.py
index 4f343e476c..c5ba0438ef 100644
--- a/onmt/transforms/tokenize.py
+++ b/onmt/transforms/tokenize.py
@@ -2,6 +2,7 @@
 from onmt.utils.logging import logger
 from onmt.transforms import register_transform
 from .transform import Transform
+from onmt.constants import ModelTask
 
 
 class TokenizerTransform(Transform):
@@ -90,6 +91,7 @@ def _parse_opts(self):
         self.tgt_subword_vocab = self.opts.tgt_subword_vocab
         self.src_vocab_threshold = self.opts.src_vocab_threshold
         self.tgt_vocab_threshold = self.opts.tgt_vocab_threshold
+        self.model_task = self.opts.model_task
 
     def _repr_args(self):
         """Return str represent key arguments for TokenizerTransform."""
@@ -169,7 +171,10 @@ def _tokenize(self, tokens, side='src', is_train=False):
     def apply(self, example, is_train=False, stats=None, **kwargs):
         """Apply sentencepiece subword encode to src & tgt."""
         src_out = self._tokenize(example['src'], 'src', is_train)
-        tgt_out = self._tokenize(example['tgt'], 'tgt', is_train)
+        if self.model_task == ModelTask.LANGUAGE_MODEL:
+            tgt_out = src_out
+        else:
+            tgt_out = self._tokenize(example['tgt'], 'tgt', is_train)
         if stats is not None:
             n_words = len(example['src']) + len(example['tgt'])
             n_subwords = len(src_out) + len(tgt_out)
@@ -243,7 +248,10 @@ def _tokenize(self, tokens, side='src', is_train=False):
     def apply(self, example, is_train=False, stats=None, **kwargs):
         """Apply bpe subword encode to src & tgt."""
         src_out = self._tokenize(example['src'], 'src', is_train)
-        tgt_out = self._tokenize(example['tgt'], 'tgt', is_train)
+        if self.model_task == ModelTask.LANGUAGE_MODEL:
+            tgt_out = src_out
+        else:
+            tgt_out = self._tokenize(example['tgt'], 'tgt', is_train)
         if stats is not None:
             n_words = len(example['src']) + len(example['tgt'])
             n_subwords = len(src_out) + len(tgt_out)
@@ -327,7 +335,7 @@ def get_specials(cls, opts):
             tgt_specials.update(_case_specials)
         return (set(), set())
 
-    def _get_subword_kwargs(self, side='src'):
+    def _get_subword_kwargs(self, side='src', is_train=False):
         """Return a dict containing kwargs relate to `side` subwords."""
         subword_type = self.tgt_subword_type if side == 'tgt' \
             else self.src_subword_type
@@ -340,7 +348,7 @@ def _get_subword_kwargs(self, side='src'):
         kwopts = dict()
         if subword_type == 'bpe':
             kwopts['bpe_model_path'] = subword_model
-            kwopts['bpe_dropout'] = subword_alpha
+            kwopts['bpe_dropout'] = subword_alpha if is_train else 0
         elif subword_type == 'sentencepiece':
             kwopts['sp_model_path'] = subword_model
             kwopts['sp_nbest_size'] = subword_nbest
@@ -360,42 +368,65 @@ def warm_up(self, vocabs=None):
         """Initialize Tokenizer models."""
         super().warm_up(None)
         import pyonmttok
-        src_subword_kwargs = self._get_subword_kwargs(side='src')
+
+        src_subword_kwargs = self._get_subword_kwargs(
+            side="src", is_train=True
+        )
+        valid_src_subword_kwargs = self._get_subword_kwargs(
+            side="src", is_train=False
+        )
         src_tokenizer = pyonmttok.Tokenizer(
             **src_subword_kwargs, **self.src_other_kwargs
         )
-        tgt_subword_kwargs = self._get_subword_kwargs(side='tgt')
-        _diff_vocab = (
-            src_subword_kwargs.get('vocabulary_path', '') !=
-            tgt_subword_kwargs.get('vocabulary_path', '') or
-            src_subword_kwargs.get('vocabulary_threshold', 0) !=
-            tgt_subword_kwargs.get('vocabulary_threshold', 0))
+        valid_src_tokenizer = pyonmttok.Tokenizer(
+            **valid_src_subword_kwargs, **self.src_other_kwargs
+        )
+        tgt_subword_kwargs = self._get_subword_kwargs(
+            side="tgt", is_train=True
+        )
+        _diff_vocab = src_subword_kwargs.get(
+            "vocabulary_path", ""
+        ) != tgt_subword_kwargs.get(
+            "vocabulary_path", ""
+        ) or src_subword_kwargs.get(
+            "vocabulary_threshold", 0
+        ) != tgt_subword_kwargs.get(
+            "vocabulary_threshold", 0
+        )
         if self.share_vocab and not _diff_vocab:
             self.load_models = {
-                'src': src_tokenizer,
-                'tgt': src_tokenizer
+                "src": {"train": src_tokenizer, "valid": valid_src_tokenizer},
+                "tgt": {"train": src_tokenizer, "valid": valid_src_tokenizer},
             }
         else:
-            tgt_subword_kwargs = self._get_subword_kwargs(side='tgt')
             tgt_tokenizer = pyonmttok.Tokenizer(
                 **tgt_subword_kwargs, **self.tgt_other_kwargs
             )
+            valid_tgt_subword_kwargs = self._get_subword_kwargs(
+                side="tgt", is_train=False
+            )
+            valid_tgt_tokenizer = pyonmttok.Tokenizer(
+                **valid_tgt_subword_kwargs, **self.tgt_other_kwargs
+            )
             self.load_models = {
-                'src': src_tokenizer,
-                'tgt': tgt_tokenizer
+                "src": {"train": src_tokenizer, "valid": valid_src_tokenizer},
+                "tgt": {"train": tgt_tokenizer, "valid": valid_tgt_tokenizer},
             }
 
     def _tokenize(self, tokens, side='src', is_train=False):
         """Do OpenNMT Tokenizer's tokenize."""
-        tokenizer = self.load_models[side]
+        tokenizer = self.load_models[side]['train' if is_train else 'valid']
         sentence = ' '.join(tokens)
         segmented, _ = tokenizer.tokenize(sentence)
         return segmented
 
     def apply(self, example, is_train=False, stats=None, **kwargs):
         """Apply OpenNMT Tokenizer to src & tgt."""
-        src_out = self._tokenize(example['src'], 'src')
-        tgt_out = self._tokenize(example['tgt'], 'tgt')
+        src_out = self._tokenize(example['src'], 'src', is_train)
+        if self.model_task == ModelTask.LANGUAGE_MODEL:
+            tgt_out = src_out
+        else:
+            tgt_out = self._tokenize(example['tgt'], 'tgt', is_train)
         if stats is not None:
             n_words = len(example['src']) + len(example['tgt'])
             n_subwords = len(src_out) + len(tgt_out)

From 49659287128c62ae24736fe7b9ee326e902993cf Mon Sep 17 00:00:00 2001
From: Valentin Berkes
Date: Wed, 17 Feb 2021 14:56:22 +0100
Subject: [PATCH 2/5] fix missing model_task in opts

---
 onmt/transforms/misc.py     | 3 ++-
 onmt/transforms/tokenize.py | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/onmt/transforms/misc.py b/onmt/transforms/misc.py
index d5635b9945..1cc61b5bbe 100644
--- a/onmt/transforms/misc.py
+++ b/onmt/transforms/misc.py
@@ -72,6 +72,7 @@ def add_options(cls, parser):
     def _parse_opts(self):
         self.src_joiner_dropout = self.opts.src_joiner_dropout
         self.tgt_joiner_dropout = self.opts.tgt_joiner_dropout
+        self.model_task = getattr(self.opts, "model_task", None)
 
     def dropout_separate_joiner(self, seq, side="src"):
         out_seq = []
@@ -104,7 +105,7 @@ def apply(self, example, is_train=False, stats=None, **kwargs):
         else:
             src_out = self.dropout_separate_joiner(example["src"], "src")
             example["src"] = src_out
-            if self.opts.model_task == ModelTask.LANGUAGE_MODEL:
+            if self.model_task == ModelTask.LANGUAGE_MODEL:
                 example["tgt"] = src_out
             else:
                 tgt_out = self.dropout_separate_joiner(example["tgt"], "tgt")
diff --git a/onmt/transforms/tokenize.py b/onmt/transforms/tokenize.py
index c5ba0438ef..e7fc03a7c5 100644
--- a/onmt/transforms/tokenize.py
+++ b/onmt/transforms/tokenize.py
@@ -91,7 +91,7 @@ def _parse_opts(self):
         self.tgt_subword_vocab = self.opts.tgt_subword_vocab
         self.src_vocab_threshold = self.opts.src_vocab_threshold
         self.tgt_vocab_threshold = self.opts.tgt_vocab_threshold
-        self.model_task = self.opts.model_task
+        self.model_task = getattr(self.opts, "model_task", None)
 
     def _repr_args(self):
         """Return str represent key arguments for TokenizerTransform."""
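Note on the LANGUAGE_MODEL special case above: with subword regularization active, two `_tokenize` calls on the same sentence can return different segmentations, so tokenizing src and tgt independently would desynchronize them for a language model, where tgt is just the shifted src. Reusing `src_out` keeps both sides on the identical sample. A minimal sketch of the non-determinism, assuming a trained BPE model at the placeholder path `codes.bpe` (`bpe_dropout` and `joiner_annotate` are real pyonmttok options):

```python
import pyonmttok

# With bpe_dropout > 0, segmentation is stochastic: repeated calls on the
# same input may apply different BPE merges.
tok = pyonmttok.Tokenizer(
    "conservative",
    joiner_annotate=True,
    bpe_model_path="codes.bpe",  # placeholder path
    bpe_dropout=0.5,
)
a, _ = tok.tokenize("unbelievable")
b, _ = tok.tokenize("unbelievable")
# `a` and `b` need not be equal, e.g. ['un', '￭believ', '￭able'] vs
# ['unbeliev', '￭able'], so an LM's tgt must reuse the src sample.
```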
From 14cdcf40e8013b5329670d5c0c207b329681bf95 Mon Sep 17 00:00:00 2001
From: Valentin Berkes
Date: Wed, 17 Feb 2021 15:33:11 +0100
Subject: [PATCH 3/5] disable sp sampling

---
 onmt/transforms/tokenize.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/onmt/transforms/tokenize.py b/onmt/transforms/tokenize.py
index e7fc03a7c5..9254a2c26d 100644
--- a/onmt/transforms/tokenize.py
+++ b/onmt/transforms/tokenize.py
@@ -346,9 +346,13 @@ def _get_subword_kwargs(self, side='src', is_train=False):
         subword_alpha = self.tgt_subword_alpha if side == 'tgt' \
             else self.src_subword_alpha
         kwopts = dict()
+        if not is_train:
+            # disable random aspects during validation
+            subword_alpha = 0
+            subword_nbest = 1
         if subword_type == 'bpe':
             kwopts['bpe_model_path'] = subword_model
-            kwopts['bpe_dropout'] = subword_alpha if is_train else 0
+            kwopts['bpe_dropout'] = subword_alpha
         elif subword_type == 'sentencepiece':
             kwopts['sp_model_path'] = subword_model
             kwopts['sp_nbest_size'] = subword_nbest
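The combined effect of patches 1 and 3 is that subword regularization only runs at training time: `_get_subword_kwargs` zeroes `bpe_dropout` and forces `sp_nbest_size` to 1 when `is_train=False`, and `warm_up` builds a deterministic twin tokenizer per side. A condensed sketch of that train/valid split, again with a placeholder model path but real pyonmttok options:

```python
import pyonmttok

# Stochastic tokenizer for training (BPE dropout on) ...
train_tok = pyonmttok.Tokenizer(
    "conservative", joiner_annotate=True,
    bpe_model_path="codes.bpe", bpe_dropout=0.1,  # placeholder path
)
# ... and a deterministic twin for validation (dropout forced to 0).
valid_tok = pyonmttok.Tokenizer(
    "conservative", joiner_annotate=True,
    bpe_model_path="codes.bpe", bpe_dropout=0,
)

def tokenize(tokens, is_train=False):
    # Mirrors ONMTTokenizerTransform._tokenize: pick the tokenizer by phase.
    tok = train_tok if is_train else valid_tok
    segmented, _ = tok.tokenize(" ".join(tokens))
    return segmented
```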
From 98f8189d0b041edc9c948eb29e24731b679088ca Mon Sep 17 00:00:00 2001
From: Valentin Berkes
Date: Wed, 17 Feb 2021 16:28:32 +0100
Subject: [PATCH 4/5] fix dropped element in JoinerDropout

---
 onmt/transforms/misc.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/onmt/transforms/misc.py b/onmt/transforms/misc.py
index 1cc61b5bbe..13f073f183 100644
--- a/onmt/transforms/misc.py
+++ b/onmt/transforms/misc.py
@@ -83,6 +83,7 @@ def dropout_separate_joiner(self, seq, side="src"):
         )
         for elem in seq:
             if elem == SubwordMarker.JOINER:
+                out_seq.append(elem)
                 continue
             if elem.startswith(SubwordMarker.JOINER):
                 if random.random() < dropout:

From 003fe4a7a93d7993b94fbf0ef32bb64c4bb78354 Mon Sep 17 00:00:00 2001
From: Valentin Berkes
Date: Fri, 19 Feb 2021 17:24:26 +0000
Subject: [PATCH 5/5] fix joiner dropout

---
 onmt/transforms/misc.py | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/onmt/transforms/misc.py b/onmt/transforms/misc.py
index 13f073f183..875438255b 100644
--- a/onmt/transforms/misc.py
+++ b/onmt/transforms/misc.py
@@ -82,21 +82,16 @@ def dropout_separate_joiner(self, seq, side="src"):
             else self.tgt_joiner_dropout
         )
         for elem in seq:
-            if elem == SubwordMarker.JOINER:
-                out_seq.append(elem)
-                continue
-            if elem.startswith(SubwordMarker.JOINER):
+            if len(elem) > 1 and elem.startswith(SubwordMarker.JOINER):
                 if random.random() < dropout:
                     out_seq.append(SubwordMarker.JOINER)
                     elem = elem[1:]
             if len(elem) > 1 and elem.endswith(SubwordMarker.JOINER):
                 if random.random() < dropout:
                     out_seq.append(elem[:-1])
-                    out_seq.append(SubwordMarker.JOINER)
-                else:
-                    out_seq.append(elem)
-            else:
-                out_seq.append(elem)
+                    elem = elem[-1:]
+            out_seq.append(elem)
+
         return out_seq
 
     def apply(self, example, is_train=False, stats=None, **kwargs):
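For reference, the `dropout_separate_joiner` logic as it stands after patch 5, as a self-contained sketch with the joiner hardcoded to OpenNMT's `￭` marker (`onmt.constants.SubwordMarker.JOINER`). The `len(elem) > 1` guards leave a bare `￭` token untouched, which is exactly the case patches 4 and 5 were chasing:

```python
import random

JOINER = "￭"  # onmt.constants.SubwordMarker.JOINER

def dropout_separate_joiner(seq, dropout):
    """Detach leading/trailing joiners as standalone tokens with prob. `dropout`."""
    out_seq = []
    for elem in seq:
        # Leading joiner: maybe emit it as its own token before the stem.
        if len(elem) > 1 and elem.startswith(JOINER):
            if random.random() < dropout:
                out_seq.append(JOINER)
                elem = elem[1:]
        # Trailing joiner: maybe emit the stem first, then keep the joiner.
        if len(elem) > 1 and elem.endswith(JOINER):
            if random.random() < dropout:
                out_seq.append(elem[:-1])
                elem = elem[-1:]
        out_seq.append(elem)
    return out_seq

# With dropout=1.0 every attached joiner is disjoined:
print(dropout_separate_joiner(["Hello", "￭,", "wor", "ld￭", "!"], 1.0))
# -> ['Hello', '￭', ',', 'wor', 'ld', '￭', '!']
```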