Skip to content

Commit

Permalink
Fixing an issue where QuantilesSketch did not ignore np.nan values…
Browse files Browse the repository at this point in the history
… and weights and returned incorrect quantiles. `tft.quantiles` will now also ignore `np.nan`s.

PiperOrigin-RevId: 387629527
  • Loading branch information
iindyk authored and tf-transform-team committed Jul 29, 2021
1 parent cbb20d9 commit ea54bc2
Show file tree
Hide file tree
Showing 4 changed files with 41 additions and 11 deletions.
3 changes: 3 additions & 0 deletions RELEASE.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@

## Bug Fixes and Other Changes

* `tft.quantiles` now ignores NaNs in input values and weights. Previously,
NaNs would lead to incorrect boundary calculation.

## Breaking Changes

## Deprecations
Expand Down
3 changes: 2 additions & 1 deletion tensorflow_transform/analyzers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2114,10 +2114,11 @@ def quantiles(x: tf.Tensor,
name: Optional[str] = None) -> tf.Tensor:
"""Computes the quantile boundaries of a `Tensor` over the whole dataset.
quantile boundaries are computed using approximate quantiles,
Quantile boundaries are computed using approximate quantiles,
and error tolerance is specified using `epsilon`. The boundaries divide the
input tensor into approximately equal `num_buckets` parts.
See go/squawd for details, and how to control the error due to approximation.
NaN input values and values with NaN weights are ignored.
Args:
x: An input `Tensor`.
Expand Down
44 changes: 34 additions & 10 deletions tensorflow_transform/beam/bucketize_integration_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,18 @@ def _construct_test_bucketization_parameters():
(range(1, 100), [25, 50, 75], False, 0.00001, True, True),
)
dtypes = (tf.int32, tf.int64, tf.float32, tf.float64, tf.double)
return (x + (dtype,) for x in args_without_dtype for dtype in dtypes)

args_with_dtype = [
# Tests for handling np.nan input values.
(list(range(1, 100)) + [np.nan] * 10, [25, 50, 75], False, 0.01,
True, True, tf.float32),
(list(range(1, 100)) + [np.nan] * 10, [25, 50, 75], False, 0.01,
False, True, tf.float32),
(list(range(1, 100)) + [np.nan] * 10, [25, 50, 75], False, 0.01,
False, False, tf.float32),
]
return ([x + (dtype,) for x in args_without_dtype for dtype in dtypes] +
args_with_dtype)


class BucketizeIntegrationTest(tft_unit.TransformTestCase):
Expand Down Expand Up @@ -134,8 +145,10 @@ def preprocessing_fn(inputs):

# Sort the input based on value, index is used to create expected_data.
indexed_input = enumerate(test_inputs)

sorted_list = sorted(indexed_input, key=lambda p: p[1])
# We put all np.nans in the end of the list so that they get assigned to the
# last bucket.
sorted_list = sorted(
indexed_input, key=lambda p: np.inf if np.isnan(p[1]) else p[1])

# Expected data has the same size as input, one bucket per input value.
expected_data = [None] * len(test_inputs)
Expand Down Expand Up @@ -295,15 +308,18 @@ def compute_bucket(instance):
# Test for all numerical types, each type is in a separate testcase to
# increase parallelism of test shards and reduce test time.
@tft_unit.parameters(
(tf.int32,),
(tf.int64,),
(tf.float32,),
(tf.float64,),
(tf.double,),
(tf.int32, False),
(tf.int64, False),
(tf.float32, False),
(tf.float32, True),
(tf.float64, False),
(tf.float64, True),
(tf.double, False),
(tf.double, True),
# TODO(b/64836936): Enable test after bucket inconsistency is fixed.
# (tf.float16,)
# (tf.float16, False)
)
def testQuantileBucketsWithWeights(self, input_dtype):
def testQuantileBucketsWithWeights(self, input_dtype, with_nans):

def analyzer_fn(inputs):
return {
Expand All @@ -316,6 +332,14 @@ def analyzer_fn(inputs):
}

input_data = [{'x': [x], 'weights': [x / 100.]} for x in range(1, 3000)]
if with_nans:
input_data += [{
'x': [np.nan],
'weights': [100000]
}, {
'x': [100000],
'weights': [np.nan]
}]
input_metadata = tft_unit.metadata_from_feature_spec({
'x':
tf.io.FixedLenFeature(
Expand Down
2 changes: 2 additions & 0 deletions tensorflow_transform/mappers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1707,6 +1707,8 @@ def bucketize(x: common_types.TensorType,
can be different than num_buckets hint, for example in case the number of
distinct values is smaller than num_buckets, or in cases where the
input values are not uniformly distributed.
NaN values are mapped to the last bucket. Values with NaN weights are
ignored in bucket boundaries calculation.
Raises:
TypeError: If num_buckets is not an int.
Expand Down

0 comments on commit ea54bc2

Please sign in to comment.