Skip to content

Commit

Permalink
Fixing an issue where QuantilesSketch did not ignore np.nan values…
Browse files Browse the repository at this point in the history
… and weights and returned incorrect quantiles. `tft.quantiles` will now also ignore `np.nan`s.

PiperOrigin-RevId: 387629527
  • Loading branch information
iindyk authored and tf-transform-team committed Jul 29, 2021
1 parent cbb20d9 commit ea54bc2
Show file tree
Hide file tree
Showing 4 changed files with 41 additions and 11 deletions.
3 changes: 3 additions & 0 deletions RELEASE.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@

## Bug Fixes and Other Changes

* `tft.quantiles` now ignores NaNs in input values and weights. Previously,
NaNs would lead to incorrect boundary calculation.

## Breaking Changes

## Deprecations
Expand Down
3 changes: 2 additions & 1 deletion tensorflow_transform/analyzers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2114,10 +2114,11 @@ def quantiles(x: tf.Tensor,
name: Optional[str] = None) -> tf.Tensor:
"""Computes the quantile boundaries of a `Tensor` over the whole dataset.
quantile boundaries are computed using approximate quantiles,
Quantile boundaries are computed using approximate quantiles,
and error tolerance is specified using `epsilon`. The boundaries divide the
input tensor into approximately equal `num_buckets` parts.
See go/squawd for details, and how to control the error due to approximation.
NaN input values and values with NaN weights are ignored.
Args:
x: An input `Tensor`.
Expand Down
44 changes: 34 additions & 10 deletions tensorflow_transform/beam/bucketize_integration_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,18 @@ def _construct_test_bucketization_parameters():
(range(1, 100), [25, 50, 75], False, 0.00001, True, True),
)
dtypes = (tf.int32, tf.int64, tf.float32, tf.float64, tf.double)
return (x + (dtype,) for x in args_without_dtype for dtype in dtypes)

args_with_dtype = [
# Tests for handling np.nan input values.
(list(range(1, 100)) + [np.nan] * 10, [25, 50, 75], False, 0.01,
True, True, tf.float32),
(list(range(1, 100)) + [np.nan] * 10, [25, 50, 75], False, 0.01,
False, True, tf.float32),
(list(range(1, 100)) + [np.nan] * 10, [25, 50, 75], False, 0.01,
False, False, tf.float32),
]
return ([x + (dtype,) for x in args_without_dtype for dtype in dtypes] +
args_with_dtype)


class BucketizeIntegrationTest(tft_unit.TransformTestCase):
Expand Down Expand Up @@ -134,8 +145,10 @@ def preprocessing_fn(inputs):

# Sort the input based on value, index is used to create expected_data.
indexed_input = enumerate(test_inputs)

sorted_list = sorted(indexed_input, key=lambda p: p[1])
# We put all np.nans in the end of the list so that they get assigned to the
# last bucket.
sorted_list = sorted(
indexed_input, key=lambda p: np.inf if np.isnan(p[1]) else p[1])

# Expected data has the same size as input, one bucket per input value.
expected_data = [None] * len(test_inputs)
Expand Down Expand Up @@ -295,15 +308,18 @@ def compute_bucket(instance):
# Test for all numerical types, each type is in a separate testcase to
# increase parallelism of test shards and reduce test time.
@tft_unit.parameters(
(tf.int32,),
(tf.int64,),
(tf.float32,),
(tf.float64,),
(tf.double,),
(tf.int32, False),
(tf.int64, False),
(tf.float32, False),
(tf.float32, True),
(tf.float64, False),
(tf.float64, True),
(tf.double, False),
(tf.double, True),
# TODO(b/64836936): Enable test after bucket inconsistency is fixed.
# (tf.float16,)
# (tf.float16, False)
)
def testQuantileBucketsWithWeights(self, input_dtype):
def testQuantileBucketsWithWeights(self, input_dtype, with_nans):

def analyzer_fn(inputs):
return {
Expand All @@ -316,6 +332,14 @@ def analyzer_fn(inputs):
}

input_data = [{'x': [x], 'weights': [x / 100.]} for x in range(1, 3000)]
if with_nans:
input_data += [{
'x': [np.nan],
'weights': [100000]
}, {
'x': [100000],
'weights': [np.nan]
}]
input_metadata = tft_unit.metadata_from_feature_spec({
'x':
tf.io.FixedLenFeature(
Expand Down
2 changes: 2 additions & 0 deletions tensorflow_transform/mappers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1707,6 +1707,8 @@ def bucketize(x: common_types.TensorType,
can be different than num_buckets hint, for example in case the number of
distinct values is smaller than num_buckets, or in cases where the
input values are not uniformly distributed.
NaN values are mapped to the last bucket. Values with NaN weights are
ignored in bucket boundaries calculation.
Raises:
TypeError: If num_buckets is not an int.
Expand Down

0 comments on commit ea54bc2

Please sign in to comment.