From f7f3346aa71fc4e681b3d41ac7ae77097c1c46c7 Mon Sep 17 00:00:00 2001
From: Stefan Schweter
Date: Sun, 16 Jul 2017 01:04:47 +0200
Subject: [PATCH 1/3] Add data file urls for Macedonian-English

---
 tensor2tensor/data_generators/generator_utils.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tensor2tensor/data_generators/generator_utils.py b/tensor2tensor/data_generators/generator_utils.py
index 890f92c2a..9e3e7db2f 100644
--- a/tensor2tensor/data_generators/generator_utils.py
+++ b/tensor2tensor/data_generators/generator_utils.py
@@ -244,6 +244,11 @@ def gunzip_file(gz_path, new_path):
         "http://www.statmt.org/wmt13/training-parallel-un.tgz",
         ["un/undoc.2000.fr-en.en", "un/undoc.2000.fr-en.fr"]
     ],
+    # Macedonian-English
+    [
+        "https://github.com/stefan-it/nmt-mk-en/raw/master/data/setimes.mk-en.train.tgz",  # pylint: disable=line-too-long
+        ["train.mk", "train.en"]
+    ],
 ]
 
 

From f34ab16d89c33b0686662539b5776cde7756b037 Mon Sep 17 00:00:00 2001
From: Stefan Schweter
Date: Sun, 16 Jul 2017 01:05:46 +0200
Subject: [PATCH 2/3] Add id for Macedonian tokens

---
 tensor2tensor/data_generators/problem.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tensor2tensor/data_generators/problem.py b/tensor2tensor/data_generators/problem.py
index 28f4dcb1b..992aa3410 100644
--- a/tensor2tensor/data_generators/problem.py
+++ b/tensor2tensor/data_generators/problem.py
@@ -67,6 +67,8 @@ class SpaceID(object):
   ICE_TOK = 18
   # Icelandic parse tokens
   ICE_PARSE_TOK = 19
+  # Macedonian tokens
+  MK_TOK = 20
 
 
 class Problem(object):

From 48997b50f203b7429df02433ec75f3daf0ce0ea6 Mon Sep 17 00:00:00 2001
From: Stefan Schweter
Date: Sun, 16 Jul 2017 01:07:19 +0200
Subject: [PATCH 3/3] Register new problem: setimes_mken_tokens_32k for
 Macedonian to English translation (with SETimes corpus)

---
 tensor2tensor/data_generators/wmt.py | 53 ++++++++++++++++++++++++++++
 1 file changed, 53 insertions(+)

diff --git a/tensor2tensor/data_generators/wmt.py b/tensor2tensor/data_generators/wmt.py
index 8edab8ba2..504336b52 100644
--- a/tensor2tensor/data_generators/wmt.py
+++ b/tensor2tensor/data_generators/wmt.py
@@ -81,6 +81,31 @@ def _default_wmt_feature_encoders(data_dir, target_vocab_size):
       "targets": subtokenizer,
   }
 
+@registry.register_problem("setimes_mken_tokens_32k")
+class SETimesMkEnTokens32k(problem.Problem):
+  """Problem spec for SETimes Mk-En translation."""
+
+  @property
+  def target_vocab_size(self):
+    return 2**15  # 32768
+
+  def feature_encoders(self, data_dir):
+    return _default_wmt_feature_encoders(data_dir, self.target_vocab_size)
+
+  def generate_data(self, data_dir, tmp_dir):
+    generator_utils.generate_dataset_and_shuffle(
+        mken_wordpiece_token_generator(tmp_dir, True, self.target_vocab_size),
+        self.training_filepaths(data_dir, 100, shuffled=False),
+        mken_wordpiece_token_generator(tmp_dir, False, self.target_vocab_size),
+        self.dev_filepaths(data_dir, 1, shuffled=False))
+
+  def hparams(self, defaults, unused_model_hparams):
+    p = defaults
+    vocab_size = self._encoders["inputs"].vocab_size
+    p.input_modality = {"inputs": (registry.Modalities.SYMBOL, vocab_size)}
+    p.target_modality = (registry.Modalities.SYMBOL, vocab_size)
+    p.input_space_id = problem.SpaceID.MK_TOK
+    p.target_space_id = problem.SpaceID.EN_TOK
 
 # End-of-sentence marker.
 EOS = text_encoder.EOS_TOKEN
@@ -295,6 +320,21 @@ def ende_bpe_token_generator(tmp_dir, train):
     ("dev/newsdev2017-zhen-src.zh", "dev/newsdev2017-zhen-ref.en")
 ]]
 
+# For Macedonian-English, the SETimes corpus
+# from http://nlp.ffzg.hr/resources/corpora/setimes/ is used.
+# The original dataset has 207,777 parallel sentences.
+# For training, the first 205,777 sentences are used.
+_MKEN_TRAIN_DATASETS = [[
+    "https://github.com/stefan-it/nmt-mk-en/raw/master/data/setimes.mk-en.train.tgz",  # pylint: disable=line-too-long
+    ("train.mk", "train.en")
+]]
+
+# For development, 1000 parallel sentences are used.
+_MKEN_TEST_DATASETS = [[
+    "https://github.com/stefan-it/nmt-mk-en/raw/master/data/setimes.mk-en.dev.tgz",  # pylint: disable=line-too-long
+    ("dev.mk", "dev.en")
+]]
+
 
 def _compile_data(tmp_dir, datasets, filename):
   """Concatenate all `datasets` and save to `filename`."""
@@ -393,6 +433,19 @@ def enfr_character_generator(tmp_dir, train):
   return character_generator(data_path + ".lang1", data_path + ".lang2",
                              character_vocab, EOS)
 
+def mken_wordpiece_token_generator(tmp_dir, train, vocab_size):
+  """Wordpiece generator for the SETimes Mk-En dataset."""
+  datasets = _MKEN_TRAIN_DATASETS if train else _MKEN_TEST_DATASETS
+  source_datasets = [[item[0], [item[1][0]]] for item in datasets]
+  target_datasets = [[item[0], [item[1][1]]] for item in datasets]
+  symbolizer_vocab = generator_utils.get_or_generate_vocab(
+      tmp_dir, "tokens.vocab.%d" % vocab_size, vocab_size,
+      source_datasets + target_datasets)
+  tag = "train" if train else "dev"
+  data_path = _compile_data(tmp_dir, datasets, "setimes_mken_tok_%s" % tag)
+  return token_generator(data_path + ".lang1", data_path + ".lang2",
+                         symbolizer_vocab, EOS)
+
 
 def parsing_character_generator(tmp_dir, train):
   character_vocab = text_encoder.ByteTextEncoder()
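
Not part of the patch series above, but for reviewers who want to try it: once
all three patches are applied, the new problem can be exercised directly from
Python. This is a minimal sketch; the /tmp directory paths are placeholders,
and it assumes the registry behavior of this era of tensor2tensor, where
importing the wmt module runs the @registry.register_problem decorator.

    # Minimal usage sketch for the problem registered in PATCH 3/3.
    # The /tmp paths below are placeholders; adjust to your environment.
    from tensor2tensor.data_generators import wmt  # registers the problem
    from tensor2tensor.utils import registry

    # Look up the problem under the name given to @registry.register_problem.
    mken_problem = registry.problem("setimes_mken_tokens_32k")

    # Downloads the SETimes train/dev archives into tmp_dir, builds the shared
    # 32k wordpiece vocabulary, writes 100 training shards and 1 dev shard into
    # data_dir, and shuffles them (see generate_data() in PATCH 3/3).
    mken_problem.generate_data("/tmp/t2t_data", "/tmp/t2t_tmp")

In normal use this step would be driven by the t2t-datagen binary (typically
--problem=setimes_mken_tokens_32k); the direct calls above just make the
sequence explicit.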