data_pipe.py
import model_def
import process_config
import os
#os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
import random
import tensorflow as tf

def make_example_generator(filepath, seqlen, batchsize, shuffle):
    total_bytes = os.path.getsize(filepath)
    num_batches = total_bytes // batchsize
    def example_generator():
        starts = range(-batchsize, 0)
        for i in range(num_batches):
            with open(filepath, 'rb') as f:
                starts = tf.random.uniform([batchsize], minval=0, maxval=total_bytes - seqlen,
                        dtype=tf.dtypes.int32) if shuffle else [s + batchsize for s in starts]
                sequences = []
                for start in starts:
                    f.seek(start)
                    sequences.append(f.read(seqlen))
                yield sequences
    return example_generator
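
# Illustration (hypothetical numbers, for intuition only): for a 1000-byte file with
# batchsize=10 and shuffle=False, example_generator() yields 100 batches, each a list
# of 10 byte-strings of seqlen bytes read from sequentially advancing file offsets;
# with shuffle=True the offsets are drawn uniformly at random instead.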

def file_to_dataset(filepath, config, maskinputs=True):
    batchsize = config['batchsize']
    seqlen = config['seqlen']
    predict_ahead = config['predict_ahead']
    xy_seqlen = seqlen + predict_ahead
    example_generator = make_example_generator(filepath, xy_seqlen, batchsize, maskinputs)
    lines = tf.data.Dataset.from_generator(example_generator, tf.string, tf.TensorShape([batchsize]))
    lines = lines.unbatch()
    lines = lines.map(lambda line: string_to_ids(line))
    lines = lines.batch(batchsize)
    lines = lines.map(lambda line: tf.reshape(line, [batchsize, xy_seqlen]))  # explicitly set shape
    lines = lines.map(lambda line: (line[:, :-predict_ahead], line[:, -predict_ahead:]))
    if maskinputs:
        # Randomly mask some of the input values
        lines = lines.map(lambda x, y: (randomly_mask_sampled_maskprob(x, 0.05, 0.2), y))
        for _ in range(seqlen // 32):
            lines = lines.map(lambda x, y: (randomly_span_mask(x, 0, 8), y))
    lines = lines.map(lambda x, y: ((x, normalize(x),
            randomly_mask_sampled_maskprob(add_go_byte(y), 0.05, 0.2)), (normalize(y), y[:, :1])))
    lines = lines.prefetch(4)
    return lines
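
# Each dataset element is a pair of tuples:
#   inputs:  (x, normalize(x), a masked go-byte-shifted copy of y)
#   targets: (normalize(y), y[:, :1])
# where x is [batchsize, seqlen] and y is [batchsize, predict_ahead].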

def collect_targets(line, config):
    """
    Each compute block has a fixed context window and emits logits to predict one char into the
    past, and one into the future. The targets for these logits are extracted from the sequence,
    taking into account the context window for the given block.
    """
    targets = []
    context_window = config['seqlen']
    for block in config['blocks']:
        subseqlen = block['subseqlen_compressed']
        backward_indices = list(range(0, context_window, subseqlen))
        forward_indices = list(range(1 + subseqlen, 2 + context_window, subseqlen))
        backward_targets = tf.gather(line, backward_indices, axis=1)
        forward_targets = tf.gather(line, forward_indices, axis=1)
        targets.append((backward_targets, forward_targets))
    targets = tuple(targets)
    return targets
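
# Illustration (hypothetical config, for intuition only): with seqlen=8 and a block
# where block['subseqlen_compressed'] == 4, backward_indices == [0, 4] and
# forward_indices == [5, 9], i.e. each subsequence predicts one char just before its
# window and one just after it.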

def normalize(char_ids):
    """
    Lowercases char ids in the ASCII range A-Z; all other ids pass through unchanged.
    """
    ascii_upper_A = 65
    ascii_upper_Z = 90
    ascii_lower_a = 97
    difference = ascii_lower_a - ascii_upper_A
    char_ids = tf.where(tf.logical_and(char_ids >= ascii_upper_A, char_ids <= ascii_upper_Z),
            char_ids + difference, char_ids)
    return char_ids
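
# e.g. normalize(tf.constant([72, 73, 33])) -> [104, 105, 33]  ('HI!' -> 'hi!')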

def normalize_targets(char_ids_tuple):
    """
    Targets are organized in a tuple of sequences. Each sequence is normalized here, and the first
    target of the last sequence is separated and NOT normalized.
    """
    target_for_next_char = char_ids_tuple[-1][:, :, 0]
    char_ids_tuple = [normalize(y) for y in char_ids_tuple]
    return tuple(char_ids_tuple + [target_for_next_char])

def bag_of_chars(tensor):
    """
    Takes a sequence of chars and converts it to a one-hot bag-of-chars representation.
    """
    tensor = tf.one_hot(tensor, 256)  # batch, seq, 256
    tensor = tf.reduce_sum(tensor, axis=1)
    return tf.where(tensor == 0, tensor, tf.ones_like(tensor))
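
# e.g. bag_of_chars(tf.constant([[104, 105, 104]])) -> a [1, 256] tensor with 1.0 at
# indices 104 and 105 and 0.0 elsewhere (repeat counts are clamped to 1).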

def add_go_byte(tensor):
    # Create a go byte that will be attached to the beginning of the last dim of tensor
    go = tf.zeros(tensor.shape[:-1], tensor.dtype)
    go = tf.expand_dims(go, axis=go.shape.rank)
    tensor = tf.concat([go, tensor], axis=go.shape.rank - 1)
    # Trim the end of the last dim of the tensor to keep the original shape
    slice_start = [0] * tensor.shape.rank
    slice_size = tensor.shape.as_list()
    slice_size[-1] -= 1
    return tf.slice(tensor, slice_start, slice_size)
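
# e.g. add_go_byte(tf.constant([[5, 6, 7]])) -> [[0, 5, 6]]: a zero "go" byte is
# prepended and the last value dropped, so the shape is unchanged.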

def mask_first_char(tensor):
    batchsize = tensor.shape[0]
    go = tf.zeros([batchsize, 1], tensor.dtype)
    return tf.concat([go, tensor[:, 1:]], axis=1)

def randomly_mask_sampled_maskprob(tensor, min_maskprob, max_maskprob):
    """
    Randomly mask values in the tensor, where the masking rate is uniformly sampled from
    [min_maskprob, max_maskprob]. This is done independently for each batch item.
    """
    # Sample a masking probability for each batch item
    maskprob = tf.random.uniform([tensor.shape[0]], minval=min_maskprob, maxval=max_maskprob,
            dtype=tf.dtypes.float32)
    maskprob = tf.expand_dims(maskprob, axis=1)
    maskprob = tf.tile(maskprob, [1, tensor.shape[1]])
    # Create a mask
    mask = tf.random.uniform(tensor.shape, minval=0, maxval=1, dtype=tf.dtypes.float32)
    mask = tf.where(tf.less(mask, maskprob), tf.zeros_like(mask), tf.ones_like(mask))
    return tensor * tf.cast(mask, tensor.dtype)
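
# e.g. randomly_mask_sampled_maskprob(x, 0.05, 0.2) (as called above) zeroes roughly
# 5-20% of each row of x, with the exact rate drawn independently per row.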

def randomly_sequence_mask(tensor):
    """
    Mask the first N characters of the first P examples in the batch, where N is uniformly sampled
    from range(0, len(example)) for each example, and P is a constant where 0 <= P <= batchsize.
    """
    percent_masked = 0.2  # proportion of batch elements that will have a mask applied
    batchsize, maxlen = tensor.shape
    num_masked = tf.cast(percent_masked * tf.cast(batchsize, tf.float32), tf.int32)
    lengths = tf.random.uniform(shape=[num_masked], minval=0, maxval=maxlen, dtype=tf.int32)
    lengths = tf.concat([lengths, tf.zeros([batchsize - num_masked], dtype=lengths.dtype)], axis=0)
    mask = tf.sequence_mask(lengths, maxlen=maxlen)
    return tf.where(mask, tf.zeros_like(tensor), tensor)
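
# e.g. for a [10, 64] batch, the first 2 rows (20%) get their first N chars zeroed,
# with N drawn uniformly from [0, 64) per row; the remaining rows pass through.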

def randomly_span_mask(tensor, mean_span_len, stdv_span_len):
    """ Mask a random span of contiguous characters within each example """
    batchsize, maxlen = tensor.shape
    def span_mask(vector):  # computes and applies mask to a single row of the batch
        span_len = tf.random.normal(shape=[], mean=mean_span_len, stddev=stdv_span_len)
        span_len = tf.cast(span_len, tf.int32)
        span_len = tf.math.maximum(0, span_len)
        span_len = tf.math.minimum(maxlen, span_len)
        mask = tf.zeros([span_len], dtype=vector.dtype)
        span_start = tf.random.uniform(shape=[], minval=0, maxval=tf.maximum(1, maxlen - span_len),
                dtype=tf.int32)
        left_pad = tf.ones([span_start], dtype=vector.dtype)
        right_pad = tf.ones([maxlen - (span_start + span_len)], dtype=vector.dtype)
        mask = tf.concat([left_pad, mask, right_pad], axis=0)
        return vector * mask
    percent_masked = 1.0  # proportion of batch elements that will have a mask applied
    num_masked = tf.cast(percent_masked * tf.cast(batchsize, tf.float32), tf.int32)
    unmasked = tensor[:-num_masked]
    masked = tf.map_fn(span_mask, tensor[-num_masked:])
    return tf.concat([unmasked, masked], axis=0)
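
# e.g. randomly_span_mask(x, 0, 8) (as called above) zeroes one contiguous span per
# row, with the span length drawn from N(0, 8) and clipped to [0, maxlen].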

def string_to_ids(tf_string):
    result = tf.strings.bytes_split(tf_string)
    # Decode raw bytes: data is prepped to a fixed number of bytes per line, so some valid utf-8
    # characters may get split into invalid utf-8 bytes if they lie on the boundary.
    result = tf.io.decode_raw(result, tf.uint8)
    result = tf.cast(result, tf.int32)
    result = tf.squeeze(result, axis=1)
    return result
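
# e.g. string_to_ids(tf.constant(b'hi')) -> [104, 105] (int32 byte ids)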

def ids_to_string(tensor):
    result = tf.strings.unicode_encode(tensor, 'UTF-8', errors='ignore')
    return result

def ids_to_python_string(tensor):
    # Manually convert the ints to char bytes, then to a string. This avoids producing weird
    # characters when a unicode sequence has been broken up.
    result = tf.cast(tensor, tf.uint8).numpy()
    result = [str(bytes(line), 'utf-8', 'replace') for line in result]
    return result

if __name__ == '__main__':
    config = process_config.load_config()
    config['batchsize'] = 10
    lines = file_to_dataset('./traindata.txt', config)
    print(lines)
    lines = iter(lines)
    for batch in range(5):
        print('\n\n\nbatch: {}'.format(batch))
        example = next(lines)
        inputs, targets = example
        print(inputs)
        print(targets)
        inputs = [ids_to_python_string(x) for x in inputs]
        targets = [ids_to_python_string(y) for y in targets]
        for idx in range(config['batchsize']):
            print('Inputs:')
            for x in inputs:
                print(x[idx].replace(chr(0), '_'))
            print()
            print('Targets:')
            for y in targets:
                print(y[idx])
            print()
            print('---------------------------------------')