
Versions of problems with combined examples. Good for TPU training.
PiperOrigin-RevId: 176807931
nshazeer authored and Ryan Sepassi committed Nov 29, 2017
1 parent b3cad0c commit 936db05
Showing 4 changed files with 135 additions and 8 deletions.
65 changes: 65 additions & 0 deletions tensor2tensor/data_generators/generator_utils.py
@@ -447,3 +447,68 @@ def shuffle_dataset(filenames):
    out_fname = fname.replace(UNSHUFFLED_SUFFIX, "")
    write_records(records, out_fname)
    tf.gfile.Remove(fname)


def combine_examples_no_inputs(examples, max_length):
  """Combine examples into longer examples.

  Concatenate targets to form target sequences with length up to max_length.
  Target sequences longer than max_length are chopped into multiple sequences.

  Args:
    examples: a generator returning feature dictionaries.
    max_length: an integer.

  Yields:
    feature dictionaries.
  """
  partial = []
  for example in examples:
    x = example["targets"]
    if len(x) + len(partial) > max_length:
      if partial:
        yield {"inputs": [0], "targets": partial}
        partial = []
    if len(x) > max_length:
      num_fragments = len(x) // max_length
      for i in xrange(num_fragments):
        yield {"inputs": [0], "targets": x[max_length * i:max_length * (i + 1)]}
      partial = x[max_length * num_fragments:]
    else:
      partial += x
  if partial:
    yield {"inputs": [0], "targets": partial}


def combine_examples_with_inputs(examples, max_length):
  """Combine examples into longer examples.

  We combine multiple examples by concatenating the inputs and concatenating
  the targets. Sequences where the inputs or the targets are too long are
  emitted as singletons (not chopped).

  Args:
    examples: a generator returning feature dictionaries.
    max_length: an integer.

  Yields:
    feature dictionaries.
  """
  partial_a = []
  partial_b = []
  for example in examples:
    a = example["inputs"]
    b = example["targets"]
    if (len(a) + len(partial_a) > max_length or
        len(b) + len(partial_b) > max_length):
      if partial_a or partial_b:
        yield {"inputs": partial_a, "targets": partial_b}
        partial_a = []
        partial_b = []
    if len(a) > max_length or len(b) > max_length:
      yield {"inputs": a, "targets": b}
    else:
      partial_a += a
      partial_b += b
  if partial_a or partial_b:
    yield {"inputs": partial_a, "targets": partial_b}
41 changes: 36 additions & 5 deletions tensor2tensor/data_generators/lm1b.py
@@ -112,12 +112,15 @@ def _maybe_download_corpus(tmp_dir):
    corpus_tar.extractall(tmp_dir)


-def _get_or_build_subword_text_encoder(tmp_dir, vocab_filepath):
+def _get_or_build_subword_text_encoder(tmp_dir,
+                                       vocab_filepath,
+                                       target_size):
  """Builds a SubwordTextEncoder based on the corpus.

  Args:
    tmp_dir: directory containing dataset.
    vocab_filepath: path to store (or load) vocab.
    target_size: an optional integer.

  Returns:
    a SubwordTextEncoder.
@@ -137,8 +140,13 @@ def _get_or_build_subword_text_encoder(tmp_dir, vocab_filepath):
    line_count += 1
    if line_count >= max_lines:
      break
-  ret = text_encoder.SubwordTextEncoder()
-  ret.build_from_token_counts(token_counts, min_count=5)
+  if target_size == 2 ** 15:
+    # legacy behavior
+    ret = text_encoder.SubwordTextEncoder()
+    ret.build_from_token_counts(token_counts, min_count=5)
+  else:
+    ret = text_encoder.SubwordTextEncoder.build_to_target_size(
+        target_size, token_counts, 1, 1000)
  ret.store_to_file(vocab_filepath)
  return ret

@@ -183,7 +191,7 @@ def targeted_vocab_size(self):

  @property
  def use_train_shards_for_dev(self):
-    return True
+    return False

  def generator(self, data_dir, tmp_dir, is_training):
    """Generator for lm1b sentences.
@@ -204,7 +212,8 @@ def generator(self, data_dir, tmp_dir, is_training):
      encoder = text_encoder.ByteTextEncoder()
    else:
      vocab_filepath = os.path.join(data_dir, self.vocab_file)
-      encoder = _get_or_build_subword_text_encoder(tmp_dir, vocab_filepath)
+      encoder = _get_or_build_subword_text_encoder(
+          tmp_dir, vocab_filepath, self.targeted_vocab_size)
    for filepath in files:
      tf.logging.info("filepath = %s", filepath)
      for line in tf.gfile.Open(filepath):
@@ -214,6 +223,28 @@
yield {"inputs": [0], "targets": tokens}


@registry.register_problem
class LanguagemodelLm1b8kConcat512(LanguagemodelLm1b32k):
  """A language model on the 1B words corpus.

  8k vocabulary.
  Training/eval examples are concatenated to a maximum length of 512.
  Happy TPU training.

  Ratio of dev tokens (including eos) to dev words (including eos)
  207351 / 159658 = 1.29872; multiply ppx by this to compare results.
  """

  @property
  def targeted_vocab_size(self):
    return 2**13  # 8192

  @property
  def combine_to_length(self):
    return 512


@registry.register_problem
class LanguagemodelLm1bCharacters(LanguagemodelLm1b32k):
  """A language model on the 1B words corpus, character level."""
25 changes: 22 additions & 3 deletions tensor2tensor/data_generators/problem.py
@@ -585,6 +585,22 @@ def generator(self, data_dir, tmp_dir, is_training):
"""
raise NotImplementedError()

  def maybe_combine_examples(self, generator):
    if self.combine_to_length:
      if self.has_inputs:
        return generator_utils.combine_examples_with_inputs(
            generator, self.combine_to_length)
      else:
        return generator_utils.combine_examples_no_inputs(
            generator, self.combine_to_length)
    else:
      return generator

  @property
  def combine_to_length(self):
    """An optional integer. Concatenate examples into bigger examples."""
    return None

  @property
  def use_train_shards_for_dev(self):
    """If true, we only generate training data and hold out shards for dev."""
@@ -630,12 +646,15 @@ def generate_data(self, data_dir, tmp_dir, task_id=-1):
    if self.use_train_shards_for_dev:
      all_paths = train_paths + dev_paths
      generator_utils.generate_files(
-          self.generator(data_dir, tmp_dir, True), all_paths)
+          self.maybe_combine_examples(self.generator(data_dir, tmp_dir, True)),
+          all_paths)
      generator_utils.shuffle_dataset(all_paths)
    else:
      generator_utils.generate_dataset_and_shuffle(
-          self.generator(data_dir, tmp_dir, True), train_paths,
-          self.generator(data_dir, tmp_dir, False), dev_paths)
+          self.maybe_combine_examples(self.generator(data_dir, tmp_dir, True)),
+          train_paths,
+          self.maybe_combine_examples(self.generator(data_dir, tmp_dir, False)),
+          dev_paths)

  def feature_encoders(self, data_dir):
    if self.is_character_level:
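For orientation, a hypothetical example of how an existing problem would opt into this machinery (this subclass is not part of the commit; the parent class, the registry decorator, and the combine_to_length property are real). Overriding combine_to_length is all that is needed: generate_data then routes the problem's generator through maybe_combine_examples, exactly as the two registered *Concat512 problems in this commit do.

# Hypothetical subclass, illustration only; it does not exist in the repository.
from tensor2tensor.data_generators import translate_ende
from tensor2tensor.utils import registry


@registry.register_problem
class TranslateEndeWmt8kConcat256(translate_ende.TranslateEndeWmt8k):
  """Toy variant: training/eval examples packed to a maximum length of 256."""

  @property
  def combine_to_length(self):
    return 256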
12 changes: 12 additions & 0 deletions tensor2tensor/data_generators/translate_ende.py
@@ -114,6 +114,18 @@ def target_space_id(self):
    return problem.SpaceID.DE_BPE_TOK


@registry.register_problem
class TranslateEndeWmtBpe32kConcat512(TranslateEndeWmtBpe32k):
  """Problem spec for WMT En-De translation, BPE version.

  Training/eval examples are concatenated to a maximum length of 512.
  """

  @property
  def combine_to_length(self):
    return 512


@registry.register_problem
class TranslateEndeWmt8k(translate.TranslateProblem):
  """Problem spec for WMT En-De translation."""
