This repository has been archived by the owner on Sep 7, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 10
/
Copy pathexplanations.py
404 lines (324 loc) · 14.3 KB
/
explanations.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
# Copyright 2017 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Static descriptions of the reason why each linter was triggered."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import pprint
import pydoc
import re
import linters
# Name of the attribute under which `formatter` stores a formatting function
# on a linter class.
FMT_FN = 'format_warnings'
# Default `max_width` of the formatting functions in this module.
MAX_WIDTH = 80
# Default `max_len` of `pformat`.
MAX_LEN = 30
# Preambles for the generic lists of flagged features.
FLAG_PREAMBLE = 'Flagged features:'
FLAG_SAMPS_PREAMBLE = 'Flagged features (and sample values):'
# List-item templates: a bare item, and an item followed by details.
LI = '* {}'
LI_SAMP = LI + ': {}'
# Pieces of the table drawn by the DuplicateExampleDetector formatter.
# BORDER_SZ evaluates to 5, the length of '|   |'.
COLSEP = ' | '
BORDER_SZ = len('| {} |'.format(' '))
# Percentage template shared by the list-length and sign formatters.
PCT_FMT = '{:.3n}%'
# UncommonListLengthDetector: one item naming two (percentage, length) pairs.
ULL_WFMT = LI + ': {} had length {} but {} had {}.'
# NonNormalNumericFeatureDetector: preamble takes a mean and a std dev; each
# item lists the flagged statistics as 'name = value'.
NFNR_PREAMBLE = ('A \'typical\' numeric feature in the dataset has mean {:.3n}'
' and std dev {:.5n} but')
NFNR_WFMT = LI + ' had {}'
NFNR_STATFMT = '{} = {:.5n}'
# DuplicateExampleDetector: preamble plus limits on the rendered table
# (maximum formatted value length and maximum number of feature columns).
DD_PREAMBLE = 'Found {} exact duplicate examples, {} of which are shown below.'
DD_MAX_VAL_LEN = 20
DD_MAX_COLS = 30
# UncommonSignDetector: preamble and per-sign template.
USD_PREAMBLE = 'Flagged features and prevalence of uncommon sign(s):'
USD_WFMT = '{} occurred {} of the time'
# TailedDistributionDetector: preamble and per-extremum template.
TDD_PREAMBLE = 'Flagged features and outlying extrema:'
TDD_WFMT = '{} value of {:.3n}'
def formatter(cls):
    """Builds a decorator that registers a formatting function on `cls`.

    Args:
      cls: the class that should receive the formatting function.

    Returns:
      A decorator which stores the decorated function on `cls` as a static
      method under the attribute named by `FMT_FN`, then returns the function
      itself unchanged.
    """
    def register(format_fn):
        # staticmethod keeps the function from being bound to linter instances.
        setattr(cls, FMT_FN, staticmethod(format_fn))
        return format_fn

    return register
def pformat(obj, max_len=MAX_LEN, quote=True):
    """Pretty prints an object as a short string.

    Args:
      obj: the object to format. A non-string sequence holding exactly one
        element is replaced by that element before formatting.
      max_len: the length the result is crammed to using `pydoc.cram`.
      quote: if False, leading/trailing single and double quotes are stripped
        from the formatted string.

    Returns:
      The formatted (and possibly shortened) string.
    """
    # Unwrap single-element sequences. Strings and bytes are skipped because
    # indexing bytes yields an int on Python 3; dicts and sets are skipped
    # because `obj[0]` is a key lookup or unsupported there.
    is_unwrappable = not isinstance(obj, (str, bytes, dict, set, frozenset))
    if is_unwrappable and hasattr(obj, '__len__') and len(obj) == 1:
        obj = obj[0]

    if isinstance(obj, float):
        pstr = '{:.4n}'.format(obj)
    elif isinstance(obj, int) and not isinstance(obj, bool):
        # Integer format specs may not carry a precision ('.4n' raises
        # ValueError), so ints are formatted without one.
        pstr = '{:n}'.format(obj)
    else:
        pstr = pprint.pformat(obj)
        # Drop Python 2 unicode-literal prefixes (u'...'). The lookbehind keeps
        # a genuine trailing "u" inside a string (e.g. 'menu') from being
        # mistaken for a prefix.
        pstr = re.sub(r'(?<![\w"\'])u(["\'])', r'\1', pstr)

    if not quote:
        pstr = pstr.strip("'").strip('"')
    return pydoc.cram(pstr, max_len)
def _format_warning_sample_pair(warning, sample, quote_vals=True):
    """Formats one warning and its sample values as a single list item.

    Args:
      warning: the warning to display; rendered unquoted with `pformat`.
      sample: an object exposing `strings` and `nums`. `strings` is used when it
        is non-empty, otherwise `nums`.
      quote_vals: forwarded to `pformat` as `quote` for every sample value.

    Returns:
      `LI_SAMP` filled with the warning and the comma-separated sample values.
      When there are no sample values, the warning text fills both slots.
    """
    label = pformat(warning, quote=False)
    values = sample.strings or sample.nums
    if not values:
        return LI_SAMP.format(label, label)
    rendered = [pformat(value, quote=quote_vals) for value in values]
    return LI_SAMP.format(label, ', '.join(rendered))
@formatter(linters.LintDetector)
def _format_warnings(result, suppress=frozenset(), max_width=MAX_WIDTH):
    """Generic string formatter for warnings and, optionally, samples.

    Args:
      result: a LintResult proto produced by this linter
      suppress: a set of warnings to be suppressed for this linter
      max_width: the maximum width of a line generated by this formatter
        (not used by this formatter)

    Returns:
      A list of lines to be printed.
    """
    if result.lint_samples:
        lines = [FLAG_SAMPS_PREAMBLE]
        lines.extend(
            _format_warning_sample_pair(warning, samp)
            for warning, samp in zip(result.warnings, result.lint_samples)
            if warning not in suppress)
        return lines

    # Suppressed warnings are skipped here as well, so that `suppress` behaves
    # the same whether or not the result carries samples.
    wstrs = [LI.format(pformat(warning))
             for warning in result.warnings
             if warning not in suppress]
    return [FLAG_PREAMBLE, '\n'.join(wstrs)]
@formatter(linters.EnumDetector)
def _format_warnings_de(result, suppress=frozenset(), max_width=MAX_WIDTH):
    """String formatter for EnumDetector LintResults.

    Args:
      result: the LintResult whose `warnings` and `lint_samples` are paired up.
      suppress: warnings to leave out of the output.
      max_width: not used by this formatter.

    Returns:
      A list starting with the samples preamble, followed by one line per
      unsuppressed warning with its sample values shown unquoted.
    """
    flagged = [
        _format_warning_sample_pair(warning, samp, False)
        for warning, samp in zip(result.warnings, result.lint_samples)
        if warning not in suppress
    ]
    return [FLAG_SAMPS_PREAMBLE] + flagged
@formatter(linters.UncommonListLengthDetector)
def _format_warnings_llo(result, suppress=frozenset(), max_width=MAX_WIDTH):
    """String formatter for UncommonListLengthDetector LintResults.

    Args:
      result: the LintResult; each sample supplies a total in `nums[0]` and
        buckets (with `count`, `min` and `max`) in `stats`.
      suppress: warnings to leave out of the output.
      max_width: not used by this formatter.

    Returns:
      A list starting with the preamble, followed by one `ULL_WFMT` line per
      unsuppressed warning.
    """
    out = [FLAG_PREAMBLE]
    for warning, sample in zip(result.warnings, result.lint_samples):
        if warning in suppress:
            continue
        total = sample.nums[0]
        # Interleave (percentage, bounds) per bucket, in bucket order.
        fields = []
        for bucket in sample.stats:
            fields.append(PCT_FMT.format(bucket.count / total * 100))
            # A set collapses min == max into one value, so such a bucket is
            # shown as a single length rather than as 'x to x'.
            bounds = sorted({bucket.min, bucket.max})
            fields.append(' to '.join('{:.0f}'.format(b) for b in bounds))
        out.append(ULL_WFMT.format(pformat(warning, quote=False), *fields))
    return out
@formatter(linters.EmptyExampleDetector)
def _format_warnings_dee(result, suppress=frozenset(), max_width=MAX_WIDTH):
    """String formatter for EmptyExampleDetector LintResults.

    Args:
      result: the LintResult; its first warning is reported as the count.
      suppress: not used by this formatter.
      max_width: not used by this formatter.

    Returns:
      A one-element list holding the summary line.
    """
    empty_count = result.warnings[0]
    message = 'Found {} empty examples.'.format(empty_count)
    return [message]
@formatter(linters.NonNormalNumericFeatureDetector)
def _format_warnings_nfnr(result, suppress=frozenset(), max_width=MAX_WIDTH):
    """String formatter for NonNormalNumericFeatureDetector LintResults.

    Args:
      result: the LintResult. The first lint sample holds the dataset-wide
        statistics; the remaining samples line up with `result.warnings`. Each
        warning has the form '<feature>:<stat>[,<stat>...]'.
      suppress: feature names to leave out of the output.
      max_width: not used by this formatter.

    Returns:
      A list starting with the preamble (dataset mean and std dev), followed by
      one line per unsuppressed feature listing its flagged statistics.
    """
    ds_stats = result.lint_samples[0].stats[0]
    lines = [NFNR_PREAMBLE.format(ds_stats.mean, ds_stats.std_dev)]
    for warning, sample in zip(result.warnings, result.lint_samples[1:]):
        # Split on the last colon only, so feature names that themselves
        # contain ':' do not break the unpacking.
        warned_feature, warning_stats = warning.rsplit(':', 1)
        if warned_feature in suppress:
            continue
        stats = sample.stats[0]
        stats_warnings = [
            NFNR_STATFMT.format(warned_stat, getattr(stats, warned_stat))
            for warned_stat in warning_stats.split(',')]
        lines.append(NFNR_WFMT.format(pformat(warned_feature, quote=False),
                                      ', '.join(stats_warnings)))
    return lines
@formatter(linters.DuplicateExampleDetector)
def _format_warnings_dd(result, suppress=frozenset(), max_width=MAX_WIDTH):
    """String formatter for DuplicateExampleDetector LintResults.

    Renders the sampled duplicate examples as text tables with one column per
    feature and one row per example. Columns that do not fit within `max_width`
    are moved to further tables.

    Args:
      result: the LintResult; the first warning is reported as the duplicate
        count and the first lint sample supplies the examples.
      suppress: not used by this formatter.
      max_width: the width at which a new group of columns is started.

    Returns:
      A two-element list: the preamble and the rendered table(s) as one string.
    """
    examples = result.lint_samples[0].examples
    report = [DD_PREAMBLE.format(result.warnings[0], len(examples))]

    # Column names: the sorted union of feature names, capped at DD_MAX_COLS.
    names = set()
    for example in examples:
        names.update(example.features.feature)
    columns = sorted(names)[:DD_MAX_COLS]

    # Collect the cell text of every column and track how wide it has to be.
    widths = {c: min(len(c), DD_MAX_VAL_LEN - BORDER_SZ) for c in columns}
    cells = {c: [] for c in columns}
    for example in examples:
        for c in columns:
            text = ''
            feature = example.features.feature.get(c)
            kind = feature.WhichOneof('kind') if feature else None
            values = getattr(feature, kind).value if kind is not None else None
            if values:
                text = pformat(values, max_len=DD_MAX_VAL_LEN)
                widths[c] = max(widths[c], min(len(text), DD_MAX_VAL_LEN))
            # Missing, unset or empty features stay as blank cells.
            cells[c].append(text)

    # Pack columns left to right into groups narrower than max_width.
    groups = [[]]
    used = 0
    for c in columns:
        needed = widths[c] + BORDER_SZ
        if used + needed < max_width:
            groups[-1].append(c)
        else:
            groups.append([c])
            used = 0
        used += needed

    # Draw one table per group; only the outermost edges get a '|' border.
    n_groups = len(groups)
    tables = []
    for idx, group in enumerate(groups):
        if idx == 0:
            frame = '| {} |' if n_groups == 1 else '| {} '
        elif idx < n_groups - 1:
            frame = ' {} '
        else:
            frame = ' {} |'
        header = frame.format(COLSEP.join(
            pydoc.cram(c, widths[c]).center(widths[c]) for c in group))
        rule = '-' * len(header)
        table = [rule, header, rule]
        for row in range(len(examples)):
            table.append(frame.format(COLSEP.join(
                cells[c][row].center(widths[c]) for c in group)))
        table.append(rule)
        tables.append('\n'.join(table))

    report.append('\n\n'.join(tables))
    return report
@formatter(linters.UncommonSignDetector)
def _format_warnings_dus(result, suppress=frozenset(), max_width=MAX_WIDTH):
    """String formatter for UncommonSignDetector LintResults.

    Args:
      result: the LintResult. For each sample, `nums[0]` is the denominator,
        and `strings` pairs up with `nums[1:]` as (sign, count).
      suppress: warnings to leave out of the output.
      max_width: not used by this formatter.

    Returns:
      A list starting with the preamble, followed by one line per unsuppressed
      warning giving how often each listed sign occurred.
    """
    out = [USD_PREAMBLE]
    for warning, sample in zip(result.warnings, result.lint_samples):
        if warning in suppress:
            continue
        total = sample.nums[0]
        parts = []
        for sign, count in zip(sample.strings, sample.nums[1:]):
            pct = count / total * 100
            # Anything under one percent is reported as '< 1%'.
            shown = PCT_FMT.format(pct) if pct >= 1 else '< 1%'
            parts.append(USD_WFMT.format(sign, shown))
        out.append(
            LI_SAMP.format(pformat(warning, quote=False), ', '.join(parts)))
    return out
@formatter(linters.TailedDistributionDetector)
def _format_warnings_don(result, suppress=frozenset(), max_width=MAX_WIDTH):
    """String formatter for TailedDistributionDetector LintResults.

    Args:
      result: the LintResult. The first stats entry of each sample carries, in
        `id`, a comma-separated list of the attribute names to report.
      suppress: warnings to leave out of the output.
      max_width: not used by this formatter.

    Returns:
      A list starting with the preamble, followed by one line per unsuppressed
      warning listing each named attribute and its value.
    """
    out = [TDD_PREAMBLE]
    for warning, sample in zip(result.warnings, result.lint_samples):
        if warning in suppress:
            continue
        stats = sample.stats[0]
        parts = []
        for name in stats.id.split(','):
            # `id` names the stats attributes to read.
            parts.append(TDD_WFMT.format(name, float(getattr(stats, name))))
        out.append(
            LI_SAMP.format(pformat(warning, quote=False), ', '.join(parts)))
    return out
@formatter(linters.IntAsFloatDetector)
def _format_warnings_iaf(result, suppress=frozenset(), max_width=MAX_WIDTH):
    """String formatter for IntAsFloatDetector LintResults.

    Args:
      result: the LintResult whose warnings are listed.
      suppress: warnings to leave out of the output.
      max_width: not used by this formatter.

    Returns:
      A two-element list: the preamble and the newline-joined list items, one
      per unsuppressed warning (shown unquoted).
    """
    # Skip suppressed warnings, matching the other per-feature formatters.
    wstrs = [LI.format(pformat(warning, quote=False))
             for warning in result.warnings
             if warning not in suppress]
    return [FLAG_PREAMBLE, '\n'.join(wstrs)]
# Static explanation text attached to DateTimeAsStringDetector as its
# DESCRIPTION attribute: what was flagged, why it matters, and a quick fix.
linters.DateTimeAsStringDetector.DESCRIPTION = """
A feature flagged by this linter contains strings that might represent dates
or times.
This is a lint because feeding the string directly into a model will cause each
unique date[time] to become its own feature. This is fine if there are only a
few unique values but the linear progression of time would be better modeled if
the feature were represented as a number.
Quickfix: convert the feature to a timestamp.
"""
# Static explanation text attached to TokenizableStringDetector as its
# DESCRIPTION attribute: what was flagged, why it matters, and a quick fix.
linters.TokenizableStringDetector.DESCRIPTION = """
A feature flagged by this linter often contains long strings that have more
than a handful of unique values.
This suggests that the feature might have compositional structure that
may usefully be exposed to the model. For instance, a sentence may be better
understood as a sequence (or even set) of words.
Quickfix: [tokenize](https://nlp.stanford.edu/IR-book/html/htmledition/tokenization-1.html) the strings.
The tokens can then be used as, for instance, a
[bag of words](https://en.wikipedia.org/wiki/Bag-of-words_model) or the inputs
to an [embedding layer](https://github.com/tflearn/tflearn/blob/master/examples/nlp/lstm.py).
"""
# Static explanation texts, each attached to a linter class as its DESCRIPTION
# attribute. Most describe what was flagged and end with a suggested quick fix.

# Strings that look like numbers.
linters.NumberAsStringDetector.DESCRIPTION = """
A feature flagged by this linter often takes values that look like numbers.
For instance, it could contain simple floats, dollar values, or percents.
Quickfix: unless the feature represents a categorical value, it would be better
represented to the model as the number, itself.
"""
# Numbers that look like zip codes.
linters.ZipCodeAsNumberDetector.DESCRIPTION = """
A feature flagged by this linter is likely a zip code and should be represented
as a categorical value since there is no numerical relation between zip codes.
Quickfix: represent the zip code as a string.
"""
# Numeric features distributed unlike the other numeric features.
linters.NonNormalNumericFeatureDetector.DESCRIPTION = """
A feature flagged by this linter has a distribution that varies significantly
from the other numeric features.
Especially for linear models, poorly scaled features with high variance
(e.g., all but one are in the range [-10, 10] but one is in [0, 100000])
can wash out the effects of the other features.
Quickfix: use the [standard score](https://en.wikipedia.org/wiki/Standard_score)
of (at least) the flagged features.
"""
# Informational only; this text has no quick fix.
linters.IntAsFloatDetector.DESCRIPTION = """
While this is not, itself, a lint, it may be indicative of a feature that might
actually be categorical (and that the enum_threshold isn't set high enough).
"""
# Numeric features taking only a few distinct values.
linters.EnumDetector.DESCRIPTION = """
A feature flagged by this linter is numeric but only takes on a few values.
If this is generally the case (and not just an artifact of most values being
missing), it might be helpful to treat the feature as a categorical variable and
treat each unique value as its own boolean feature.
Quickfix: split the feature into N boolean features or index the values and use
them as the input to an embedding layer.
"""
# Value lists with unusual lengths.
linters.UncommonListLengthDetector.DESCRIPTION = """
A feature flagged by this linter has `value` lists that are not smoothly
distributed with respect to their length. For instance, most `value`s could have
length 3 but one has length 2. This could be a typo.
Quickfix: ensure that you mean to have variable length lists and that the model
is equipped to handle them.
"""
# Exactly duplicated examples.
linters.DuplicateExampleDetector.DESCRIPTION = """
This linter finds exactly duplicated Examples. It's possible that your data
generation process actually permits duplicates or they're the result of missing
entries. For spurious duplicates, a few are usually fine, but a large number or
Examples shared across train/val/test can be problematic and should be filtered.
Quickfix: remove all but one of each Example.
"""
# Completely empty examples.
linters.EmptyExampleDetector.DESCRIPTION = """
This linter detects completely empty examples. These are possibly indicative of
data entry errors.
Quickfix: remove all empty examples.
"""
# Values whose sign differs from the rest.
linters.UncommonSignDetector.DESCRIPTION = """
A feature flagged by this linter has a handful of values that have a different
sign (+/-/0/nan) from the rest. These may be the result of incorrectly entering
the values or using a custom "missing" value like -999.
Quickfix: ensure that values are valid and, if not, replace them with a
standard missing value of either an empty values list or an explicit nan.
"""
# Extremal values that significantly affect the mean.
linters.TailedDistributionDetector.DESCRIPTION = """
A feature flagged by this linter has an extremal value that significantly
affects the mean. This may be because the value is an outlier but it may also
be due to the extremal value being very common. In either case, however, it
would be beneficial to check the histograms to ensure that they follow the
expected distribution.
Quickfix: check the histograms of the feature values.
"""
# Values that wrap around (angles, hours, latitude/longitude).
linters.CircularDomainDetector.DESCRIPTION = """
A feature flagged by this linter is likely to contain values that wrap around.
For instance, angle (0 and 360 are close), hour, and latitude/longitude.
Feeding these directly into a linear model may yield incorrect results since
it does not take into account the modulus.
Quickfix: quantize the feature values and make each bucket its own
feature/embedding index.
"""