
Versions of problems with combined examples. Good for TPU training.
PiperOrigin-RevId: 176807931
nshazeer authored and Ryan Sepassi committed Nov 29, 2017
1 parent b3cad0c commit 936db05
Showing 4 changed files with 135 additions and 8 deletions.
65 changes: 65 additions & 0 deletions tensor2tensor/data_generators/generator_utils.py
@@ -447,3 +447,68 @@ def shuffle_dataset(filenames):
    out_fname = fname.replace(UNSHUFFLED_SUFFIX, "")
    write_records(records, out_fname)
    tf.gfile.Remove(fname)


def combine_examples_no_inputs(examples, max_length):
  """Combine examples into longer examples.

  Concatenate targets to form target sequences with length up to max_length.
  Target sequences longer than max_length are chopped into multiple sequences.

  Args:
    examples: a generator returning feature dictionaries.
    max_length: an integer.

  Yields:
    feature dictionaries.
  """
  partial = []
  for example in examples:
    x = example["targets"]
    if len(x) + len(partial) > max_length:
      if partial:
        yield {"inputs": [0], "targets": partial}
        partial = []
    if len(x) > max_length:
      num_fragments = len(x) // max_length
      for i in xrange(num_fragments):
        yield {"inputs": [0], "targets": x[max_length * i:max_length * (i + 1)]}
      partial = x[max_length * num_fragments:]
    else:
      partial += x
  if partial:
    yield {"inputs": [0], "targets": partial}


def combine_examples_with_inputs(examples, max_length):
  """Combine examples into longer examples.

  We combine multiple examples by concatenating the inputs and concatenating
  the targets. Sequences where the inputs or the targets are too long are
  emitted as singletons (not chopped).

  Args:
    examples: a generator returning feature dictionaries.
    max_length: an integer.

  Yields:
    feature dictionaries.
  """
  partial_a = []
  partial_b = []
  for example in examples:
    a = example["inputs"]
    b = example["targets"]
    if (len(a) + len(partial_a) > max_length or
        len(b) + len(partial_b) > max_length):
      if partial_a or partial_b:
        yield {"inputs": partial_a, "targets": partial_b}
        partial_a = []
        partial_b = []
    if len(a) > max_length or len(b) > max_length:
      yield {"inputs": a, "targets": b}
    else:
      partial_a += a
      partial_b += b
  if partial_a or partial_b:
    yield {"inputs": partial_a, "targets": partial_b}
41 changes: 36 additions & 5 deletions tensor2tensor/data_generators/lm1b.py
@@ -112,12 +112,15 @@ def _maybe_download_corpus(tmp_dir):
    corpus_tar.extractall(tmp_dir)


-def _get_or_build_subword_text_encoder(tmp_dir, vocab_filepath):
+def _get_or_build_subword_text_encoder(tmp_dir,
+                                       vocab_filepath,
+                                       target_size):
  """Builds a SubwordTextEncoder based on the corpus.

  Args:
    tmp_dir: directory containing dataset.
    vocab_filepath: path to store (or load) vocab.
    target_size: an optional integer.

  Returns:
    a SubwordTextEncoder.
@@ -137,8 +140,13 @@ def _get_or_build_subword_text_encoder(tmp_dir, vocab_filepath):
    line_count += 1
    if line_count >= max_lines:
      break
-  ret = text_encoder.SubwordTextEncoder()
-  ret.build_from_token_counts(token_counts, min_count=5)
+  if target_size == 2 ** 15:
+    # legacy behavior
+    ret = text_encoder.SubwordTextEncoder()
+    ret.build_from_token_counts(token_counts, min_count=5)
+  else:
+    ret = text_encoder.SubwordTextEncoder.build_to_target_size(
+        target_size, token_counts, 1, 1000)
  ret.store_to_file(vocab_filepath)
  return ret

@@ -183,7 +191,7 @@ def targeted_vocab_size(self):

  @property
  def use_train_shards_for_dev(self):
-    return True
+    return False

  def generator(self, data_dir, tmp_dir, is_training):
    """Generator for lm1b sentences.
@@ -204,7 +212,8 @@ def generator(self, data_dir, tmp_dir, is_training):
      encoder = text_encoder.ByteTextEncoder()
    else:
      vocab_filepath = os.path.join(data_dir, self.vocab_file)
-      encoder = _get_or_build_subword_text_encoder(tmp_dir, vocab_filepath)
+      encoder = _get_or_build_subword_text_encoder(
+          tmp_dir, vocab_filepath, self.targeted_vocab_size)
    for filepath in files:
      tf.logging.info("filepath = %s", filepath)
      for line in tf.gfile.Open(filepath):
@@ -214,6 +223,28 @@
yield {"inputs": [0], "targets": tokens}


@registry.register_problem
class LanguagemodelLm1b8kConcat512(LanguagemodelLm1b32k):
  """A language model on the 1B words corpus.

  8k vocabulary.
  Training/eval examples are concatenated to a maximum length of 512.
  Happy TPU training.

  Ratio of dev tokens (including eos) to dev words (including eos)
  207351 / 159658 = 1.29872; multiply ppx by this to compare results.
  """

  @property
  def targeted_vocab_size(self):
    return 2**13  # 8192

  @property
  def combine_to_length(self):
    return 512


@registry.register_problem
class LanguagemodelLm1bCharacters(LanguagemodelLm1b32k):
  """A language model on the 1B words corpus, character level."""
25 changes: 22 additions & 3 deletions tensor2tensor/data_generators/problem.py
@@ -585,6 +585,22 @@ def generator(self, data_dir, tmp_dir, is_training):
"""
raise NotImplementedError()

  def maybe_combine_examples(self, generator):
    if self.combine_to_length:
      if self.has_inputs:
        return generator_utils.combine_examples_with_inputs(
            generator, self.combine_to_length)
      else:
        return generator_utils.combine_examples_no_inputs(
            generator, self.combine_to_length)
    else:
      return generator

  @property
  def combine_to_length(self):
    """An optional integer. Concatenate examples into bigger examples."""
    return None

  @property
  def use_train_shards_for_dev(self):
    """If true, we only generate training data and hold out shards for dev."""
@@ -630,12 +646,15 @@ def generate_data(self, data_dir, tmp_dir, task_id=-1):
    if self.use_train_shards_for_dev:
      all_paths = train_paths + dev_paths
      generator_utils.generate_files(
-          self.generator(data_dir, tmp_dir, True), all_paths)
+          self.maybe_combine_examples(self.generator(data_dir, tmp_dir, True)),
+          all_paths)
      generator_utils.shuffle_dataset(all_paths)
    else:
      generator_utils.generate_dataset_and_shuffle(
-          self.generator(data_dir, tmp_dir, True), train_paths,
-          self.generator(data_dir, tmp_dir, False), dev_paths)
+          self.maybe_combine_examples(self.generator(data_dir, tmp_dir, True)),
+          train_paths,
+          self.maybe_combine_examples(self.generator(data_dir, tmp_dir, False)),
+          dev_paths)

  def feature_encoders(self, data_dir):
    if self.is_character_level:
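For orientation, a hypothetical example of how an existing problem would opt into this machinery (this subclass is not part of the commit; the parent class, the registry decorator, and the combine_to_length property are real). Overriding combine_to_length is all that is needed: generate_data then routes the problem's generator through maybe_combine_examples, exactly as the two registered *Concat512 problems in this commit do.

# Hypothetical subclass, illustration only; it does not exist in the repository.
from tensor2tensor.data_generators import translate_ende
from tensor2tensor.utils import registry


@registry.register_problem
class TranslateEndeWmt8kConcat256(translate_ende.TranslateEndeWmt8k):
  """Toy variant: training/eval examples packed to a maximum length of 256."""

  @property
  def combine_to_length(self):
    return 256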
12 changes: 12 additions & 0 deletions tensor2tensor/data_generators/translate_ende.py
@@ -114,6 +114,18 @@ def target_space_id(self):
    return problem.SpaceID.DE_BPE_TOK


@registry.register_problem
class TranslateEndeWmtBpe32kConcat512(TranslateEndeWmtBpe32k):
  """Problem spec for WMT En-De translation, BPE version.

  Training/eval examples are concatenated to a maximum length of 512.
  """

  @property
  def combine_to_length(self):
    return 512


@registry.register_problem
class TranslateEndeWmt8k(translate.TranslateProblem):
  """Problem spec for WMT En-De translation."""
