diff --git a/RELEASE.md b/RELEASE.md index 8fe85b94..f055b0cd 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -6,6 +6,9 @@ ## Bug Fixes and Other Changes +* `tft.quantiles` now ignores NaNs in input values and weights. Previously, + NaNs would lead to incorrect boundaries calculation. + ## Breaking Changes ## Deprecations diff --git a/tensorflow_transform/analyzers.py b/tensorflow_transform/analyzers.py index 140f7cef..ba469fe6 100644 --- a/tensorflow_transform/analyzers.py +++ b/tensorflow_transform/analyzers.py @@ -2114,10 +2114,11 @@ def quantiles(x: tf.Tensor, name: Optional[str] = None) -> tf.Tensor: """Computes the quantile boundaries of a `Tensor` over the whole dataset. - quantile boundaries are computed using approximate quantiles, + Quantile boundaries are computed using approximate quantiles, and error tolerance is specified using `epsilon`. The boundaries divide the input tensor into approximately equal `num_buckets` parts. See go/squawd for details, and how to control the error due to approximation. + NaN input values and values with NaN weights are ignored. Args: x: An input `Tensor`. diff --git a/tensorflow_transform/beam/bucketize_integration_test.py b/tensorflow_transform/beam/bucketize_integration_test.py index 2c4feb81..118a819b 100644 --- a/tensorflow_transform/beam/bucketize_integration_test.py +++ b/tensorflow_transform/beam/bucketize_integration_test.py @@ -81,7 +81,18 @@ def _construct_test_bucketization_parameters(): (range(1, 100), [25, 50, 75], False, 0.00001, True, True), ) dtypes = (tf.int32, tf.int64, tf.float32, tf.float64, tf.double) - return (x + (dtype,) for x in args_without_dtype for dtype in dtypes) + + args_with_dtype = [ + # Tests for handling np.nan input values. 
+ (list(range(1, 100)) + [np.nan] * 10, [25, 50, 75], False, 0.01, True, True, tf.float32), + (list(range(1, 100)) + [np.nan] * 10, [25, 50, 75], False, 0.01, False, True, tf.float32), + (list(range(1, 100)) + [np.nan] * 10, [25, 50, 75], False, 0.01, False, False, tf.float32), + ] + return ([x + (dtype,) for x in args_without_dtype for dtype in dtypes] + + args_with_dtype) class BucketizeIntegrationTest(tft_unit.TransformTestCase): @@ -134,8 +145,10 @@ def preprocessing_fn(inputs): # Sort the input based on value, index is used to create expected_data. indexed_input = enumerate(test_inputs) - - sorted_list = sorted(indexed_input, key=lambda p: p[1]) + # We put all np.nans at the end of the list so that they get assigned to the + # last bucket. + sorted_list = sorted( + indexed_input, key=lambda p: np.inf if np.isnan(p[1]) else p[1]) # Expected data has the same size as input, one bucket per input value. expected_data = [None] * len(test_inputs) @@ -295,15 +308,18 @@ def compute_bucket(instance): # Test for all numerical types, each type is in a separate testcase to # increase parallelism of test shards and reduce test time. @tft_unit.parameters( - (tf.int32,), - (tf.int64,), - (tf.float32,), - (tf.float64,), - (tf.double,), + (tf.int32, False), + (tf.int64, False), + (tf.float32, False), + (tf.float32, True), + (tf.float64, False), + (tf.float64, True), + (tf.double, False), + (tf.double, True), # TODO(b/64836936): Enable test after bucket inconsistency is fixed. 
- # (tf.float16,) + # (tf.float16, False) ) - def testQuantileBucketsWithWeights(self, input_dtype): + def testQuantileBucketsWithWeights(self, input_dtype, with_nans): def analyzer_fn(inputs): return { @@ -316,6 +332,14 @@ def analyzer_fn(inputs): } input_data = [{'x': [x], 'weights': [x / 100.]} for x in range(1, 3000)] + if with_nans: + input_data += [{ + 'x': [np.nan], + 'weights': [100000] + }, { + 'x': [100000], + 'weights': [np.nan] + }] input_metadata = tft_unit.metadata_from_feature_spec({ 'x': tf.io.FixedLenFeature( diff --git a/tensorflow_transform/mappers.py b/tensorflow_transform/mappers.py index db21ad1e..d5a924cb 100644 --- a/tensorflow_transform/mappers.py +++ b/tensorflow_transform/mappers.py @@ -1707,6 +1707,8 @@ def bucketize(x: common_types.TensorType, can be different than num_buckets hint, for example in case the number of distinct values is smaller than num_buckets, or in cases where the input values are not uniformly distributed. + NaN values are mapped to the last bucket. Values with NaN weights are + ignored in bucket boundaries calculation. Raises: TypeError: If num_buckets is not an int.