From c1eb39da106c1bb12c8bb731aabc6188d0c33f28 Mon Sep 17 00:00:00 2001 From: Smeet nagda <81572407+smeet07@users.noreply.github.com> Date: Wed, 12 Apr 2023 01:22:13 +0530 Subject: [PATCH 01/50] create helper function Create helper function to convert rank 2 tensor to rank 1 tensor --- .../benchmarks/cloudml/criteo_tft/criteo.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py index d2a0b652ca69..0537d60700d7 100644 --- a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py +++ b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py @@ -132,15 +132,14 @@ def preprocessing_fn(inputs): result = {'clicked': inputs['clicked']} for name in _INTEGER_COLUMN_NAMES: feature = inputs[name] - # TODO(https://github.com/apache/beam/issues/24902): - # Replace this boilerplate with a helper function. - # This is a SparseTensor because it is optional. Here we fill in a - # default value when it is missing. - feature = tft.sparse_tensor_to_dense_with_shape( - feature, [None, 1], default_value=-1) - # Reshaping from a batch of vectors of size 1 to a batch of scalars and - # adding a bucketized version. - feature = tf.squeeze(feature, axis=1) + def fill_in_missing(feature,default_value=-1): + feature= tf.sparse.SparseTensor(indices=feature.indices, values=feature.values, dense_shape=[feature.dense_shape[0], 1]) + feature = tf.sparse_to_dense(feature,default_value=default_value) + # Reshaping from a batch of vectors of size 1 to a batch of scalars and + # adding a bucketized version. + feature = tf.squeeze(feature, axis=1) + return feature + fill_in_missing(feature) result[name] = feature result[name + '_bucketized'] = tft.bucketize(feature, _NUM_BUCKETS) for name in _CATEGORICAL_COLUMN_NAMES: From 333349e14b93dcad65c77fa31d064d1882408fbf Mon Sep 17 00:00:00 2001 From: Smeet nagda <81572407+smeet07@users.noreply.github.com> Date: Wed, 12 Apr 2023 02:34:29 +0530 Subject: [PATCH 02/50] formatting --- .../testing/benchmarks/cloudml/criteo_tft/criteo.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py index 0537d60700d7..ce36f1fc05e1 100644 --- a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py +++ b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py @@ -132,9 +132,13 @@ def preprocessing_fn(inputs): result = {'clicked': inputs['clicked']} for name in _INTEGER_COLUMN_NAMES: feature = inputs[name] - def fill_in_missing(feature,default_value=-1): - feature= tf.sparse.SparseTensor(indices=feature.indices, values=feature.values, dense_shape=[feature.dense_shape[0], 1]) - feature = tf.sparse_to_dense(feature,default_value=default_value) + def fill_in_missing(feature, default_value=-1): + feature = tf.sparse.SparseTensor( + indices=feature.indices, + values=feature.values, + dense_shape=[feature.dense_shape[0], 1]) + feature = tf.sparse_to_dense(feature, default_value=default_value) + # Reshaping from a batch of vectors of size 1 to a batch of scalars and # adding a bucketized version. feature = tf.squeeze(feature, axis=1) From a6805e415266e1b93f2a9acce5942ac590c1e4aa Mon Sep 17 00:00:00 2001 From: Smeet nagda <81572407+smeet07@users.noreply.github.com> Date: Wed, 12 Apr 2023 02:42:27 +0530 Subject: [PATCH 03/50] formatting again --- .../testing/benchmarks/cloudml/criteo_tft/criteo.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py index ce36f1fc05e1..4571bd2a6dbd 100644 --- a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py +++ b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py @@ -132,17 +132,18 @@ def preprocessing_fn(inputs): result = {'clicked': inputs['clicked']} for name in _INTEGER_COLUMN_NAMES: feature = inputs[name] + def fill_in_missing(feature, default_value=-1): feature = tf.sparse.SparseTensor( indices=feature.indices, values=feature.values, dense_shape=[feature.dense_shape[0], 1]) feature = tf.sparse_to_dense(feature, default_value=default_value) - # Reshaping from a batch of vectors of size 1 to a batch of scalars and # adding a bucketized version. feature = tf.squeeze(feature, axis=1) return feature + fill_in_missing(feature) result[name] = feature result[name + '_bucketized'] = tft.bucketize(feature, _NUM_BUCKETS) From 5886181ad8ec3c1dbf87954abd125252ad3ce713 Mon Sep 17 00:00:00 2001 From: Smeet nagda <81572407+smeet07@users.noreply.github.com> Date: Wed, 12 Apr 2023 21:41:54 +0530 Subject: [PATCH 04/50] re assigning it back to the feature --- .../apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py index 4571bd2a6dbd..40079bff82e3 100644 --- a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py +++ b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py @@ -144,7 +144,7 @@ def fill_in_missing(feature, default_value=-1): feature = tf.squeeze(feature, axis=1) return feature - fill_in_missing(feature) + feature=fill_in_missing(feature) result[name] = feature result[name + '_bucketized'] = tft.bucketize(feature, _NUM_BUCKETS) for name in _CATEGORICAL_COLUMN_NAMES: From 3d0ded9a1eec454cb6e2a4e51c3a75b93830fa4c Mon Sep 17 00:00:00 2001 From: Smeet nagda <81572407+smeet07@users.noreply.github.com> Date: Wed, 12 Apr 2023 21:52:39 +0530 Subject: [PATCH 05/50] trailing whitespace --- .../apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py index 40079bff82e3..7606a978995d 100644 --- a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py +++ b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py @@ -132,7 +132,7 @@ def preprocessing_fn(inputs): result = {'clicked': inputs['clicked']} for name in _INTEGER_COLUMN_NAMES: feature = inputs[name] - + def fill_in_missing(feature, default_value=-1): feature = tf.sparse.SparseTensor( indices=feature.indices, From b22e3a7451eebea1c760037a3dd587532f0d9162 Mon Sep 17 00:00:00 2001 From: Smeet nagda <81572407+smeet07@users.noreply.github.com> Date: Thu, 13 Apr 2023 01:30:10 +0530 Subject: [PATCH 06/50] whitespace changes --- .../apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py index 7606a978995d..4ee9c1b51802 100644 --- a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py +++ b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py @@ -144,7 +144,7 @@ def fill_in_missing(feature, default_value=-1): feature = tf.squeeze(feature, axis=1) return feature - feature=fill_in_missing(feature) + feature = fill_in_missing(feature) result[name] = feature result[name + '_bucketized'] = tft.bucketize(feature, _NUM_BUCKETS) for name in _CATEGORICAL_COLUMN_NAMES: From bbe227ad3fdec03c0fc85e24b924f689f9312b84 Mon Sep 17 00:00:00 2001 From: Smeet nagda <81572407+smeet07@users.noreply.github.com> Date: Thu, 13 Apr 2023 01:32:17 +0530 Subject: [PATCH 07/50] changes --- .../apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py index 4ee9c1b51802..a656d156d725 100644 --- a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py +++ b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py @@ -132,7 +132,7 @@ def preprocessing_fn(inputs): result = {'clicked': inputs['clicked']} for name in _INTEGER_COLUMN_NAMES: feature = inputs[name] - + def fill_in_missing(feature, default_value=-1): feature = tf.sparse.SparseTensor( indices=feature.indices, From b28d6247b749aac22e14545572f349b2e820db60 Mon Sep 17 00:00:00 2001 From: Smeet nagda <81572407+smeet07@users.noreply.github.com> Date: Thu, 13 Apr 2023 02:30:01 +0530 Subject: [PATCH 08/50] replacing whitespace by tabs --- .../apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py index a656d156d725..1cd7c15c8a49 100644 --- a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py +++ b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py @@ -133,7 +133,7 @@ def preprocessing_fn(inputs): for name in _INTEGER_COLUMN_NAMES: feature = inputs[name] - def fill_in_missing(feature, default_value=-1): + def fill_in_missing(feature, default_value=-1): feature = tf.sparse.SparseTensor( indices=feature.indices, values=feature.values, From 5b049900dabc4e6a7d1fa18042439d04c1d997dd Mon Sep 17 00:00:00 2001 From: Smeet nagda <81572407+smeet07@users.noreply.github.com> Date: Thu, 13 Apr 2023 02:48:58 +0530 Subject: [PATCH 09/50] Update criteo.py --- .../apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py index 1cd7c15c8a49..a656d156d725 100644 --- a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py +++ b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py @@ -133,7 +133,7 @@ def preprocessing_fn(inputs): for name in _INTEGER_COLUMN_NAMES: feature = inputs[name] - def fill_in_missing(feature, default_value=-1): + def fill_in_missing(feature, default_value=-1): feature = tf.sparse.SparseTensor( indices=feature.indices, values=feature.values, From a4dddc9ac7a6f47b0e84809f6dc1241dcf48ad08 Mon Sep 17 00:00:00 2001 From: Smeet nagda <81572407+smeet07@users.noreply.github.com> Date: Sat, 15 Apr 2023 00:33:42 +0530 Subject: [PATCH 10/50] sparse_to_dense syntax has been changed in tf2.0 --- .../apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py index a656d156d725..aa4028b3230d 100644 --- a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py +++ b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py @@ -138,7 +138,7 @@ def fill_in_missing(feature, default_value=-1): indices=feature.indices, values=feature.values, dense_shape=[feature.dense_shape[0], 1]) - feature = tf.sparse_to_dense(feature, default_value=default_value) + feature = tf.sparse.to_dense(feature, default_value=default_value) # Reshaping from a batch of vectors of size 1 to a batch of scalars and # adding a bucketized version. feature = tf.squeeze(feature, axis=1) From 3595e06c355fdde536fd75949ffaf2a09df449c0 Mon Sep 17 00:00:00 2001 From: Smeet nagda <81572407+smeet07@users.noreply.github.com> Date: Tue, 2 May 2023 00:46:36 +0530 Subject: [PATCH 11/50] write unit test to ensure fill_in_missing --- .../benchmarks/cloudml/criteo_tft/criteo.py | 20 ++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py index aa4028b3230d..19d0b0401df8 100644 --- a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py +++ b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py @@ -108,7 +108,7 @@ def make_input_feature_spec(include_label=True): result[name] = tf.io.VarLenFeature(dtype=tf.string) return result - + def make_preprocessing_fn(frequency_threshold): """Creates a preprocessing function for criteo. @@ -160,3 +160,21 @@ def fill_in_missing(feature, default_value=-1): return result return preprocessing_fn + + +@pytest.mark.uses_tft +class FillInMissingTest(unittest.TestCase): + def test_fill_in_missing(self): + # Create a rank 2 sparse tensor with missing values + indices = np.array([[0, 0], [0, 2], [1, 1], [2, 0]]) + values = np.array([1, 2, 3, 4]) + dense_shape = np.array([3, 3]) + sparse_tensor = tf.sparse.SparseTensor(indices, values, dense_shape) + + # Fill in missing values with -1 + filled_tensor = fill_in_missing(sparse_tensor, -1) + + # Convert to a dense tensor and check the values + expected_output = np.array([1, -1, 2, -1, -1, -1, 4, -1, -1]) + actual_output = filled_tensor.numpy() + self.assertEqual(expected_output, actual_output) From af4789ad3d55f019275b60918f035dab32239ee0 Mon Sep 17 00:00:00 2001 From: Smeet nagda <81572407+smeet07@users.noreply.github.com> Date: Tue, 2 May 2023 01:02:22 +0530 Subject: [PATCH 12/50] indentation --- .../benchmarks/cloudml/criteo_tft/criteo.py | 28 ++++++++++--------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py index 19d0b0401df8..ec4f7f54a884 100644 --- a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py +++ b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py @@ -165,16 +165,18 @@ def fill_in_missing(feature, default_value=-1): @pytest.mark.uses_tft class FillInMissingTest(unittest.TestCase): def test_fill_in_missing(self): - # Create a rank 2 sparse tensor with missing values - indices = np.array([[0, 0], [0, 2], [1, 1], [2, 0]]) - values = np.array([1, 2, 3, 4]) - dense_shape = np.array([3, 3]) - sparse_tensor = tf.sparse.SparseTensor(indices, values, dense_shape) - - # Fill in missing values with -1 - filled_tensor = fill_in_missing(sparse_tensor, -1) - - # Convert to a dense tensor and check the values - expected_output = np.array([1, -1, 2, -1, -1, -1, 4, -1, -1]) - actual_output = filled_tensor.numpy() - self.assertEqual(expected_output, actual_output) + # Create a rank 2 sparse tensor with missing values + indices = np.array([[0, 0], [0, 2], [1, 1], [2, 0]]) + values = np.array([1, 2, 3, 4]) + dense_shape = np.array([3, 3]) + sparse_tensor = tf.sparse.SparseTensor(indices, values, dense_shape) + + # Fill in missing values with -1 + filled_tensor = fill_in_missing(sparse_tensor, -1) + + # Convert to a dense tensor and check the values + expected_output = np.array([1, -1, 2, -1, -1, -1, 4, -1, -1]) + actual_output = filled_tensor.numpy() + self.assertEqual(expected_output, actual_output) + + From 1989c5c077d05846e344d12ed1fa841e4503c3aa Mon Sep 17 00:00:00 2001 From: Smeet nagda <81572407+smeet07@users.noreply.github.com> Date: Fri, 12 May 2023 03:03:02 +0530 Subject: [PATCH 13/50] Create criteo_test.py --- .../benchmarks/cloudml/criteo_tft/criteo_test.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py diff --git a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py new file mode 100644 index 000000000000..b38e9edbbcfe --- /dev/null +++ b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py @@ -0,0 +1,16 @@ +@pytest.mark.uses_tft +class FillInMissingTest(unittest.TestCase): + def test_fill_in_missing(self): + # Create a rank 2 sparse tensor with missing values + indices = np.array([[0, 0], [0, 2], [1, 1], [2, 0]]) + values = np.array([1, 2, 3, 4]) + dense_shape = np.array([3, 3]) + sparse_tensor = tf.sparse.SparseTensor(indices, values, dense_shape) + + # Fill in missing values with -1 + filled_tensor = fill_in_missing(sparse_tensor, -1) + + # Convert to a dense tensor and check the values + expected_output = np.array([1, -1, 2, -1, -1, -1, 4, -1, -1]) + actual_output = filled_tensor.numpy() + self.assertEqual(expected_output, actual_output) From db855bfb0b6d468719800647b18c9d6d2323c501 Mon Sep 17 00:00:00 2001 From: Smeet nagda <81572407+smeet07@users.noreply.github.com> Date: Fri, 12 May 2023 03:04:19 +0530 Subject: [PATCH 14/50] Update criteo.py --- .../benchmarks/cloudml/criteo_tft/criteo.py | 21 +------------------ 1 file changed, 1 insertion(+), 20 deletions(-) diff --git a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py index ec4f7f54a884..d5f1be707847 100644 --- a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py +++ b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py @@ -160,23 +160,4 @@ def fill_in_missing(feature, default_value=-1): return result return preprocessing_fn - - -@pytest.mark.uses_tft -class FillInMissingTest(unittest.TestCase): - def test_fill_in_missing(self): - # Create a rank 2 sparse tensor with missing values - indices = np.array([[0, 0], [0, 2], [1, 1], [2, 0]]) - values = np.array([1, 2, 3, 4]) - dense_shape = np.array([3, 3]) - sparse_tensor = tf.sparse.SparseTensor(indices, values, dense_shape) - - # Fill in missing values with -1 - filled_tensor = fill_in_missing(sparse_tensor, -1) - - # Convert to a dense tensor and check the values - expected_output = np.array([1, -1, 2, -1, -1, -1, 4, -1, -1]) - actual_output = filled_tensor.numpy() - self.assertEqual(expected_output, actual_output) - - + From b54a881dc1dcc3f0da25905c56d1d94992dc3e06 Mon Sep 17 00:00:00 2001 From: Smeet nagda <81572407+smeet07@users.noreply.github.com> Date: Fri, 12 May 2023 03:05:09 +0530 Subject: [PATCH 15/50] add license --- .../cloudml/criteo_tft/criteo_test.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py index b38e9edbbcfe..4759815cf388 100644 --- a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py +++ b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py @@ -1,3 +1,20 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + @pytest.mark.uses_tft class FillInMissingTest(unittest.TestCase): def test_fill_in_missing(self): From b6217e94e2427563195db10894e0a2b0162dd53e Mon Sep 17 00:00:00 2001 From: Smeet nagda <81572407+smeet07@users.noreply.github.com> Date: Fri, 12 May 2023 04:19:00 +0530 Subject: [PATCH 16/50] import statements --- .../testing/benchmarks/cloudml/criteo_tft/criteo_test.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py index 4759815cf388..cd25625b6058 100644 --- a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py +++ b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py @@ -15,6 +15,9 @@ # limitations under the License. # +import unittest +import pytest + @pytest.mark.uses_tft class FillInMissingTest(unittest.TestCase): def test_fill_in_missing(self): From 6bcdded5acd3f6b1603178baa24de4cfbe6394c9 Mon Sep 17 00:00:00 2001 From: Smeet nagda <81572407+smeet07@users.noreply.github.com> Date: Wed, 17 May 2023 22:00:22 +0530 Subject: [PATCH 17/50] skip unit test --- .../testing/benchmarks/cloudml/criteo_tft/criteo_test.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py index cd25625b6058..dc657b498c89 100644 --- a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py +++ b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py @@ -17,8 +17,10 @@ import unittest import pytest +import numpy as np @pytest.mark.uses_tft +@unittest.skipIf(dlp_v2 is None, 'GCP dependencies are not installed') class FillInMissingTest(unittest.TestCase): def test_fill_in_missing(self): # Create a rank 2 sparse tensor with missing values From c2f0f38e3dce6820fd099e5d9000062f6c53a5c7 Mon Sep 17 00:00:00 2001 From: Smeet nagda <81572407+smeet07@users.noreply.github.com> Date: Wed, 17 May 2023 22:31:56 +0530 Subject: [PATCH 18/50] Update criteo_test.py --- .../testing/benchmarks/cloudml/criteo_tft/criteo_test.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py index dc657b498c89..1950c0512437 100644 --- a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py +++ b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py @@ -18,6 +18,10 @@ import unittest import pytest import numpy as np +try: + from google.cloud import dlp_v2 +except ImportError: + dlp_v2 = None @pytest.mark.uses_tft @unittest.skipIf(dlp_v2 is None, 'GCP dependencies are not installed') From 92eee6f360b8fbc0494a2acaa97ce76bce399196 Mon Sep 17 00:00:00 2001 From: Smeet nagda <81572407+smeet07@users.noreply.github.com> Date: Wed, 17 May 2023 22:52:05 +0530 Subject: [PATCH 19/50] Update criteo_test.py --- .../testing/benchmarks/cloudml/criteo_tft/criteo_test.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py index 1950c0512437..1d4641664ca9 100644 --- a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py +++ b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py @@ -19,12 +19,12 @@ import pytest import numpy as np try: - from google.cloud import dlp_v2 + import tensorflow_transform as tft except ImportError: - dlp_v2 = None + tft = None @pytest.mark.uses_tft -@unittest.skipIf(dlp_v2 is None, 'GCP dependencies are not installed') +@unittest.skipIf(tft is None, 'tft dependencies are not installed') class FillInMissingTest(unittest.TestCase): def test_fill_in_missing(self): # Create a rank 2 sparse tensor with missing values From b445beb1f3ee0b1638d0f2dec8cffbc2a2634394 Mon Sep 17 00:00:00 2001 From: Smeet nagda <81572407+smeet07@users.noreply.github.com> Date: Thu, 18 May 2023 21:40:44 +0530 Subject: [PATCH 20/50] Update criteo_test.py --- .../testing/benchmarks/cloudml/criteo_tft/criteo_test.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py index 1d4641664ca9..89c2f0412e92 100644 --- a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py +++ b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py @@ -18,6 +18,7 @@ import unittest import pytest import numpy as np +import tensorflow as tf try: import tensorflow_transform as tft except ImportError: From 015b12e9dbb5ac120e7a74d07f58c6460f766cc9 Mon Sep 17 00:00:00 2001 From: Smeet nagda <81572407+smeet07@users.noreply.github.com> Date: Fri, 19 May 2023 01:07:56 +0530 Subject: [PATCH 21/50] Update criteo_test.py --- .../testing/benchmarks/cloudml/criteo_tft/criteo_test.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py index 89c2f0412e92..1fee78ba9a79 100644 --- a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py +++ b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py @@ -18,14 +18,15 @@ import unittest import pytest import numpy as np -import tensorflow as tf try: import tensorflow_transform as tft + import tensorflow as tf except ImportError: tft = None + tf=None @pytest.mark.uses_tft -@unittest.skipIf(tft is None, 'tft dependencies are not installed') +@unittest.skipIf(tft is None, 'Missing dependencies. ',tf is None) class FillInMissingTest(unittest.TestCase): def test_fill_in_missing(self): # Create a rank 2 sparse tensor with missing values From a0a8d274666de1a6a02e9b0f3f7d84c4ee81f7a1 Mon Sep 17 00:00:00 2001 From: Smeet nagda <81572407+smeet07@users.noreply.github.com> Date: Fri, 19 May 2023 01:30:10 +0530 Subject: [PATCH 22/50] skipif syntax changes --- .../testing/benchmarks/cloudml/criteo_tft/criteo_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py index 1fee78ba9a79..36bd5684374c 100644 --- a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py +++ b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py @@ -26,7 +26,7 @@ tf=None @pytest.mark.uses_tft -@unittest.skipIf(tft is None, 'Missing dependencies. ',tf is None) +@unittest.skipIf(tft is None or tf is None, 'Missing dependencies. ') class FillInMissingTest(unittest.TestCase): def test_fill_in_missing(self): # Create a rank 2 sparse tensor with missing values From c1ca08c65b377d9546f4b4b1677cc8411aaf3d89 Mon Sep 17 00:00:00 2001 From: Smeet nagda <81572407+smeet07@users.noreply.github.com> Date: Fri, 19 May 2023 03:18:21 +0530 Subject: [PATCH 23/50] Update criteo_test.py --- .../testing/benchmarks/cloudml/criteo_tft/criteo_test.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py index 36bd5684374c..a4e7864c93fb 100644 --- a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py +++ b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py @@ -18,6 +18,7 @@ import unittest import pytest import numpy as np +from .criteo import fill_in_missing try: import tensorflow_transform as tft import tensorflow as tf From 5b34941da7d725e4adbbd8958bee7ad313f7d295 Mon Sep 17 00:00:00 2001 From: Smeet nagda <81572407+smeet07@users.noreply.github.com> Date: Fri, 19 May 2023 11:46:54 +0530 Subject: [PATCH 24/50] Update criteo.py --- .../benchmarks/cloudml/criteo_tft/criteo.py | 24 +++++++++++-------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py index d5f1be707847..354dfddd2aed 100644 --- a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py +++ b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py @@ -20,8 +20,11 @@ from __future__ import division from __future__ import print_function -import tensorflow as tf -import tensorflow_transform as tft +try: + import tensorflow as tf + import tensorflow_transform as tft +except ImportError as e: + tf = None def _get_raw_categorical_column_name(column_idx): @@ -134,14 +137,15 @@ def preprocessing_fn(inputs): feature = inputs[name] def fill_in_missing(feature, default_value=-1): - feature = tf.sparse.SparseTensor( - indices=feature.indices, - values=feature.values, - dense_shape=[feature.dense_shape[0], 1]) - feature = tf.sparse.to_dense(feature, default_value=default_value) - # Reshaping from a batch of vectors of size 1 to a batch of scalars and - # adding a bucketized version. - feature = tf.squeeze(feature, axis=1) + if tf!=None: + feature = tf.sparse.SparseTensor( + indices=feature.indices, + values=feature.values, + dense_shape=[feature.dense_shape[0], 1]) + feature = tf.sparse.to_dense(feature, default_value=default_value) + # Reshaping from a batch of vectors of size 1 to a batch of scalars and + # adding a bucketized version. + feature = tf.squeeze(feature, axis=1) return feature feature = fill_in_missing(feature) From 2be9b63abedd227953898e7240520b9dbfd41bde Mon Sep 17 00:00:00 2001 From: Smeet nagda <81572407+smeet07@users.noreply.github.com> Date: Tue, 30 May 2023 22:56:36 +0530 Subject: [PATCH 25/50] absolute import --- .../testing/benchmarks/cloudml/criteo_tft/criteo_test.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py index a4e7864c93fb..aba8f7ffc26b 100644 --- a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py +++ b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py @@ -18,13 +18,14 @@ import unittest import pytest import numpy as np -from .criteo import fill_in_missing try: import tensorflow_transform as tft import tensorflow as tf + from apache_beam.testing.benchmarks.cloudml.criteo_tft.criteo import fill_in_missing except ImportError: tft = None tf=None + fill_in_missing=None @pytest.mark.uses_tft @unittest.skipIf(tft is None or tf is None, 'Missing dependencies. ') @@ -37,7 +38,9 @@ def test_fill_in_missing(self): sparse_tensor = tf.sparse.SparseTensor(indices, values, dense_shape) # Fill in missing values with -1 - filled_tensor = fill_in_missing(sparse_tensor, -1) + filled_tensor=[] + if fill_in_missing!=None: + filled_tensor = fill_in_missing(sparse_tensor, -1) # Convert to a dense tensor and check the values expected_output = np.array([1, -1, 2, -1, -1, -1, 4, -1, -1]) From a31a0a44f24a516ab537e3836a7dce08342a22a1 Mon Sep 17 00:00:00 2001 From: Smeet nagda <81572407+smeet07@users.noreply.github.com> Date: Sun, 25 Jun 2023 11:13:48 +0530 Subject: [PATCH 26/50] whitespace changes --- .../apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py index 354dfddd2aed..25d3f3d6aa77 100644 --- a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py +++ b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py @@ -111,7 +111,7 @@ def make_input_feature_spec(include_label=True): result[name] = tf.io.VarLenFeature(dtype=tf.string) return result - + def make_preprocessing_fn(frequency_threshold): """Creates a preprocessing function for criteo. From 9d3550d22dbcf4a92e8b5494462b6559147cb630 Mon Sep 17 00:00:00 2001 From: Smeet nagda <81572407+smeet07@users.noreply.github.com> Date: Sun, 25 Jun 2023 11:33:43 +0530 Subject: [PATCH 27/50] linter changes --- .../benchmarks/cloudml/criteo_tft/criteo.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py index 25d3f3d6aa77..fcc84fb63668 100644 --- a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py +++ b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py @@ -136,19 +136,19 @@ def preprocessing_fn(inputs): for name in _INTEGER_COLUMN_NAMES: feature = inputs[name] - def fill_in_missing(feature, default_value=-1): - if tf!=None: + def fill_in_missing(feature, default_value=-1): + if tfi not None: feature = tf.sparse.SparseTensor( indices=feature.indices, values=feature.values, dense_shape=[feature.dense_shape[0], 1]) feature = tf.sparse.to_dense(feature, default_value=default_value) - # Reshaping from a batch of vectors of size 1 to a batch of scalars and - # adding a bucketized version. + # Reshaping from a batch of vectors of size 1 to a batch of + # scalar and adding a bucketized version. feature = tf.squeeze(feature, axis=1) return feature - - feature = fill_in_missing(feature) + + feature = fill_in_missing(feature) result[name] = feature result[name + '_bucketized'] = tft.bucketize(feature, _NUM_BUCKETS) for name in _CATEGORICAL_COLUMN_NAMES: @@ -164,4 +164,4 @@ def fill_in_missing(feature, default_value=-1): return result return preprocessing_fn - + From 01d7dcacb9056b5435b1357c33d8a974c538e2dc Mon Sep 17 00:00:00 2001 From: Smeet nagda <81572407+smeet07@users.noreply.github.com> Date: Sun, 25 Jun 2023 11:50:18 +0530 Subject: [PATCH 28/50] indentation --- .../apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py index fcc84fb63668..bbb64728ed9d 100644 --- a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py +++ b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py @@ -136,7 +136,7 @@ def preprocessing_fn(inputs): for name in _INTEGER_COLUMN_NAMES: feature = inputs[name] - def fill_in_missing(feature, default_value=-1): + def fill_in_missing(feature, default_value=-1): if tfi not None: feature = tf.sparse.SparseTensor( indices=feature.indices, From e018e161ae7d9c61d0a0091ab0232556146caa5d Mon Sep 17 00:00:00 2001 From: Smeet nagda <81572407+smeet07@users.noreply.github.com> Date: Sun, 25 Jun 2023 11:50:44 +0530 Subject: [PATCH 29/50] linter --- .../testing/benchmarks/cloudml/criteo_tft/criteo_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py index aba8f7ffc26b..624a6a060366 100644 --- a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py +++ b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py @@ -39,7 +39,7 @@ def test_fill_in_missing(self): # Fill in missing values with -1 filled_tensor=[] - if fill_in_missing!=None: + if fill_in_missingis not None: filled_tensor = fill_in_missing(sparse_tensor, -1) # Convert to a dense tensor and check the values From 223e7836947a10222f6492bd984c77c16bd4de44 Mon Sep 17 00:00:00 2001 From: Smeet nagda <81572407+smeet07@users.noreply.github.com> Date: Sun, 25 Jun 2023 12:23:57 +0530 Subject: [PATCH 30/50] spacing --- .../testing/benchmarks/cloudml/criteo_tft/criteo_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py index 624a6a060366..e8f836f1f32e 100644 --- a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py +++ b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py @@ -39,7 +39,7 @@ def test_fill_in_missing(self): # Fill in missing values with -1 filled_tensor=[] - if fill_in_missingis not None: + if fill_in_missing is not None: filled_tensor = fill_in_missing(sparse_tensor, -1) # Convert to a dense tensor and check the values From dff0689e35bff37b33882cd124994eb7b45dec99 Mon Sep 17 00:00:00 2001 From: Smeet nagda <81572407+smeet07@users.noreply.github.com> Date: Sun, 25 Jun 2023 12:32:04 +0530 Subject: [PATCH 31/50] Update criteo.py --- .../apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py index bbb64728ed9d..1029c359c290 100644 --- a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py +++ b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py @@ -137,7 +137,7 @@ def preprocessing_fn(inputs): feature = inputs[name] def fill_in_missing(feature, default_value=-1): - if tfi not None: + if tf is not None: feature = tf.sparse.SparseTensor( indices=feature.indices, values=feature.values, From 084541a69dff25ae9880af2485d9858085773512 Mon Sep 17 00:00:00 2001 From: Smeet nagda <81572407+smeet07@users.noreply.github.com> Date: Sun, 25 Jun 2023 12:32:39 +0530 Subject: [PATCH 32/50] Update criteo_test.py --- .../testing/benchmarks/cloudml/criteo_tft/criteo_test.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py index e8f836f1f32e..bbe7966b4d46 100644 --- a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py +++ b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py @@ -24,8 +24,8 @@ from apache_beam.testing.benchmarks.cloudml.criteo_tft.criteo import fill_in_missing except ImportError: tft = None - tf=None - fill_in_missing=None + tf = None + fill_in_missing = None @pytest.mark.uses_tft @unittest.skipIf(tft is None or tf is None, 'Missing dependencies. ') @@ -38,7 +38,7 @@ def test_fill_in_missing(self): sparse_tensor = tf.sparse.SparseTensor(indices, values, dense_shape) # Fill in missing values with -1 - filled_tensor=[] + filled_tensor = [] if fill_in_missing is not None: filled_tensor = fill_in_missing(sparse_tensor, -1) From aaedd64580491d9716d8a40c0d77344fcdf637d0 Mon Sep 17 00:00:00 2001 From: Smeet nagda <81572407+smeet07@users.noreply.github.com> Date: Sun, 25 Jun 2023 12:40:11 +0530 Subject: [PATCH 33/50] Update criteo.py --- .../apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py index 1029c359c290..a5c621403b80 100644 --- a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py +++ b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py @@ -148,7 +148,7 @@ def fill_in_missing(feature, default_value=-1): feature = tf.squeeze(feature, axis=1) return feature - feature = fill_in_missing(feature) + feature = fill_in_missing(feature) result[name] = feature result[name + '_bucketized'] = tft.bucketize(feature, _NUM_BUCKETS) for name in _CATEGORICAL_COLUMN_NAMES: From 15be29cc7409b272cd27929440c8b82fb85ff8ec Mon Sep 17 00:00:00 2001 From: Smeet nagda <81572407+smeet07@users.noreply.github.com> Date: Sun, 25 Jun 2023 12:40:34 +0530 Subject: [PATCH 34/50] Update criteo_test.py --- .../testing/benchmarks/cloudml/criteo_tft/criteo_test.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py index bbe7966b4d46..d8031564c531 100644 --- a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py +++ b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py @@ -27,6 +27,7 @@ tf = None fill_in_missing = None + @pytest.mark.uses_tft @unittest.skipIf(tft is None or tf is None, 'Missing dependencies. ') class FillInMissingTest(unittest.TestCase): From 38ba741cc7f465924b0163e4503e4b817e20ad3b Mon Sep 17 00:00:00 2001 From: Smeet nagda <81572407+smeet07@users.noreply.github.com> Date: Sun, 25 Jun 2023 12:58:50 +0530 Subject: [PATCH 35/50] Update criteo.py --- .../apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py index a5c621403b80..789fb17d7594 100644 --- a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py +++ b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py @@ -164,4 +164,3 @@ def fill_in_missing(feature, default_value=-1): return result return preprocessing_fn - From e5238ec385d18dcf7f9b7ab7b7219fe40b89fb09 Mon Sep 17 00:00:00 2001 From: Smeet Nagda <81572407+smeet07@users.noreply.github.com> Date: Wed, 5 Jul 2023 21:13:40 +0530 Subject: [PATCH 36/50] fix import issue --- .../benchmarks/cloudml/criteo_tft/criteo.py | 27 ++++++++++--------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py index 789fb17d7594..6125834fa89b 100644 --- a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py +++ b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py @@ -113,6 +113,20 @@ def make_input_feature_spec(include_label=True): return result +def fill_in_missing(feature, default_value=-1): + if tf is not None: + feature = tf.sparse.SparseTensor( + indices=feature.indices, + values=feature.values, + dense_shape=[feature.dense_shape[0], 1]) + feature = tf.sparse.to_dense(feature, default_value=default_value) + # Reshaping from a batch of vectors of size 1 to a batch of + # scalar and adding a bucketized version. + feature = tf.squeeze(feature, axis=1) + + return feature + + def make_preprocessing_fn(frequency_threshold): """Creates a preprocessing function for criteo. @@ -135,19 +149,6 @@ def preprocessing_fn(inputs): result = {'clicked': inputs['clicked']} for name in _INTEGER_COLUMN_NAMES: feature = inputs[name] - - def fill_in_missing(feature, default_value=-1): - if tf is not None: - feature = tf.sparse.SparseTensor( - indices=feature.indices, - values=feature.values, - dense_shape=[feature.dense_shape[0], 1]) - feature = tf.sparse.to_dense(feature, default_value=default_value) - # Reshaping from a batch of vectors of size 1 to a batch of - # scalar and adding a bucketized version. - feature = tf.squeeze(feature, axis=1) - return feature - feature = fill_in_missing(feature) result[name] = feature result[name + '_bucketized'] = tft.bucketize(feature, _NUM_BUCKETS) From 3496d5964d700c5b728b9ca58725e926bef7537f Mon Sep 17 00:00:00 2001 From: Smeet Nagda <81572407+smeet07@users.noreply.github.com> Date: Wed, 5 Jul 2023 21:24:45 +0530 Subject: [PATCH 37/50] indentation --- .../benchmarks/cloudml/criteo_tft/criteo.py | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py index 6125834fa89b..15bd2a5dab05 100644 --- a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py +++ b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py @@ -114,17 +114,17 @@ def make_input_feature_spec(include_label=True): def fill_in_missing(feature, default_value=-1): - if tf is not None: - feature = tf.sparse.SparseTensor( - indices=feature.indices, - values=feature.values, - dense_shape=[feature.dense_shape[0], 1]) - feature = tf.sparse.to_dense(feature, default_value=default_value) - # Reshaping from a batch of vectors of size 1 to a batch of - # scalar and adding a bucketized version. - feature = tf.squeeze(feature, axis=1) + if tf is not None: + feature = tf.sparse.SparseTensor( + indices=feature.indices, + values=feature.values, + dense_shape=[feature.dense_shape[0], 1]) + feature = tf.sparse.to_dense(feature, default_value=default_value) + # Reshaping from a batch of vectors of size 1 to a batch of + # scalar and adding a bucketized version. + feature = tf.squeeze(feature, axis=1) - return feature + return feature def make_preprocessing_fn(frequency_threshold): From 8ca9a5d21200030d3cc2ad9273b03193227abd50 Mon Sep 17 00:00:00 2001 From: Smeet Nagda <81572407+smeet07@users.noreply.github.com> Date: Wed, 5 Jul 2023 22:20:46 +0530 Subject: [PATCH 38/50] spacing issues --- .../apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py index 15bd2a5dab05..69e9792fe9bd 100644 --- a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py +++ b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py @@ -123,7 +123,7 @@ def fill_in_missing(feature, default_value=-1): # Reshaping from a batch of vectors of size 1 to a batch of # scalar and adding a bucketized version. feature = tf.squeeze(feature, axis=1) - + return feature From 9268b74bc73e697781290081e7a00bca7647e04b Mon Sep 17 00:00:00 2001 From: Smeet Nagda <81572407+smeet07@users.noreply.github.com> Date: Wed, 5 Jul 2023 22:33:59 +0530 Subject: [PATCH 39/50] lint issues --- .../testing/benchmarks/cloudml/criteo_tft/criteo_test.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py index d8031564c531..cf00417de53d 100644 --- a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py +++ b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py @@ -16,8 +16,9 @@ # import unittest -import pytest import numpy as np +import pytest + try: import tensorflow_transform as tft import tensorflow as tf From ff25761a6557b2492eb0422ddca27c174a2cb4fd Mon Sep 17 00:00:00 2001 From: Smeet Nagda <81572407+smeet07@users.noreply.github.com> Date: Sat, 8 Jul 2023 02:08:13 +0530 Subject: [PATCH 40/50] add space --- .../testing/benchmarks/cloudml/criteo_tft/criteo_test.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py index cf00417de53d..b4c74e41d209 100644 --- a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py +++ b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py @@ -16,6 +16,7 @@ # import unittest + import numpy as np import pytest From e186f890e98677a38777774f2d0ca63ec5b9dca8 Mon Sep 17 00:00:00 2001 From: Smeet Nagda <81572407+smeet07@users.noreply.github.com> Date: Sat, 8 Jul 2023 02:33:04 +0530 Subject: [PATCH 41/50] add --- .../testing/benchmarks/cloudml/criteo_tft/criteo_test.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py index b4c74e41d209..f03968916774 100644 --- a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py +++ b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py @@ -49,3 +49,7 @@ def test_fill_in_missing(self): expected_output = np.array([1, -1, 2, -1, -1, -1, 4, -1, -1]) actual_output = filled_tensor.numpy() self.assertEqual(expected_output, actual_output) + + +if __name__ == '__main__': + unittest.main() From 1af5bf8d470a390cf9fa7a0f8496cbc047608fdb Mon Sep 17 00:00:00 2001 From: Smeet Nagda <81572407+smeet07@users.noreply.github.com> Date: Sat, 8 Jul 2023 14:27:56 +0530 Subject: [PATCH 42/50] fix call --- .../testing/benchmarks/cloudml/criteo_tft/criteo_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py index f03968916774..9b35922bf581 100644 --- a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py +++ b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py @@ -41,7 +41,7 @@ def test_fill_in_missing(self): sparse_tensor = tf.sparse.SparseTensor(indices, values, dense_shape) # Fill in missing values with -1 - filled_tensor = [] + filled_tensor = None if fill_in_missing is not None: filled_tensor = fill_in_missing(sparse_tensor, -1) From e65c46a5fff3f6d9e0bd35564693e12b30faa951 Mon Sep 17 00:00:00 2001 From: Smeet Nagda <81572407+smeet07@users.noreply.github.com> Date: Sat, 8 Jul 2023 19:24:46 +0530 Subject: [PATCH 43/50] Update criteo_test.py --- .../testing/benchmarks/cloudml/criteo_tft/criteo_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py index 9b35922bf581..ae77fdc7fcd0 100644 --- a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py +++ b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py @@ -41,7 +41,7 @@ def test_fill_in_missing(self): sparse_tensor = tf.sparse.SparseTensor(indices, values, dense_shape) # Fill in missing values with -1 - filled_tensor = None + filled_tensor = tf.Tensor() if fill_in_missing is not None: filled_tensor = fill_in_missing(sparse_tensor, -1) From cf828d5f780fa629ea884f176768e56fd2613081 Mon Sep 17 00:00:00 2001 From: Smeet Nagda <81572407+smeet07@users.noreply.github.com> Date: Sun, 9 Jul 2023 10:50:55 +0530 Subject: [PATCH 44/50] Update criteo_test.py --- .../testing/benchmarks/cloudml/criteo_tft/criteo_test.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py index ae77fdc7fcd0..5b4825cfaaca 100644 --- a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py +++ b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py @@ -20,6 +20,8 @@ import numpy as np import pytest +from typing import Any, Callable, Optional + try: import tensorflow_transform as tft import tensorflow as tf @@ -27,7 +29,7 @@ except ImportError: tft = None tf = None - fill_in_missing = None + fill_in_missing = Optional[Callable[[tf.sparse.SparseTensor, int], tf.Tensor]] = None @pytest.mark.uses_tft From e8756c07a4c0116f7ee766c037c810102c258c4d Mon Sep 17 00:00:00 2001 From: Smeet Nagda <81572407+smeet07@users.noreply.github.com> Date: Tue, 11 Jul 2023 23:56:34 +0530 Subject: [PATCH 45/50] Update criteo_test.py --- .../testing/benchmarks/cloudml/criteo_tft/criteo_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py index 5b4825cfaaca..ac0fa9e76acb 100644 --- a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py +++ b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py @@ -29,7 +29,7 @@ except ImportError: tft = None tf = None - fill_in_missing = Optional[Callable[[tf.sparse.SparseTensor, int], tf.Tensor]] = None + fill_in_missing : Optional[Callable[[tf.sparse.SparseTensor, int], tf.Tensor]] = None @pytest.mark.uses_tft From 8f9bbe973f2e156e1a8233fdb8273428b8c8b37f Mon Sep 17 00:00:00 2001 From: Smeet Nagda <81572407+smeet07@users.noreply.github.com> Date: Tue, 18 Jul 2023 18:55:42 +0530 Subject: [PATCH 46/50] Update sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py Co-authored-by: Anand Inguva <34158215+AnandInguva@users.noreply.github.com> --- .../testing/benchmarks/cloudml/criteo_tft/criteo_test.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py index ac0fa9e76acb..0e864b6e134a 100644 --- a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py +++ b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py @@ -27,9 +27,11 @@ import tensorflow as tf from apache_beam.testing.benchmarks.cloudml.criteo_tft.criteo import fill_in_missing except ImportError: - tft = None - tf = None - fill_in_missing : Optional[Callable[[tf.sparse.SparseTensor, int], tf.Tensor]] = None + except ImportError: + tft = None # type: ignore[assignment] + +if not tft: + raise unittest.SkipTest('tensorflow_transform is not installed.') @pytest.mark.uses_tft From 596f424f0bafae628f9db04b6d1c90df8800bc79 Mon Sep 17 00:00:00 2001 From: Smeet Nagda <81572407+smeet07@users.noreply.github.com> Date: Tue, 18 Jul 2023 18:58:26 +0530 Subject: [PATCH 47/50] remove try block --- .../benchmarks/cloudml/criteo_tft/criteo.py | 25 ++++++++----------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py index 69e9792fe9bd..b4fdda72fe90 100644 --- a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py +++ b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo.py @@ -20,11 +20,8 @@ from __future__ import division from __future__ import print_function -try: - import tensorflow as tf - import tensorflow_transform as tft -except ImportError as e: - tf = None +import tensorflow as tf +import tensorflow_transform as tft def _get_raw_categorical_column_name(column_idx): @@ -114,16 +111,14 @@ def make_input_feature_spec(include_label=True): def fill_in_missing(feature, default_value=-1): - if tf is not None: - feature = tf.sparse.SparseTensor( - indices=feature.indices, - values=feature.values, - dense_shape=[feature.dense_shape[0], 1]) - feature = tf.sparse.to_dense(feature, default_value=default_value) - # Reshaping from a batch of vectors of size 1 to a batch of - # scalar and adding a bucketized version. - feature = tf.squeeze(feature, axis=1) - + feature = tf.sparse.SparseTensor( + indices=feature.indices, + values=feature.values, + dense_shape=[feature.dense_shape[0], 1]) + feature = tf.sparse.to_dense(feature, default_value=default_value) + # Reshaping from a batch of vectors of size 1 to a batch of + # scalar and adding a bucketized version. + feature = tf.squeeze(feature, axis=1) return feature From 8216b0dcc688db5a32e856dcdaf47de867015979 Mon Sep 17 00:00:00 2001 From: Smeet Nagda <81572407+smeet07@users.noreply.github.com> Date: Tue, 18 Jul 2023 19:04:04 +0530 Subject: [PATCH 48/50] remove double except block --- .../testing/benchmarks/cloudml/criteo_tft/criteo_test.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py index 0e864b6e134a..1f1aef818f29 100644 --- a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py +++ b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py @@ -27,7 +27,6 @@ import tensorflow as tf from apache_beam.testing.benchmarks.cloudml.criteo_tft.criteo import fill_in_missing except ImportError: - except ImportError: tft = None # type: ignore[assignment] if not tft: From b595fcb86f284ac0c3c8dfc8fa80180275377adc Mon Sep 17 00:00:00 2001 From: Smeet Nagda <81572407+smeet07@users.noreply.github.com> Date: Tue, 18 Jul 2023 22:27:45 +0530 Subject: [PATCH 49/50] remove imports and type assignment --- .../testing/benchmarks/cloudml/criteo_tft/criteo_test.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py index 1f1aef818f29..136e943915da 100644 --- a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py +++ b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py @@ -20,14 +20,12 @@ import numpy as np import pytest -from typing import Any, Callable, Optional - try: import tensorflow_transform as tft import tensorflow as tf from apache_beam.testing.benchmarks.cloudml.criteo_tft.criteo import fill_in_missing except ImportError: - tft = None # type: ignore[assignment] + tft = None if not tft: raise unittest.SkipTest('tensorflow_transform is not installed.') From f5b25b5cb58949de38672efd2f8cce005e476128 Mon Sep 17 00:00:00 2001 From: Smeet Nagda <81572407+smeet07@users.noreply.github.com> Date: Tue, 18 Jul 2023 22:45:08 +0530 Subject: [PATCH 50/50] remove trailing whitespace --- .../testing/benchmarks/cloudml/criteo_tft/criteo_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py index 136e943915da..00743c3fa7cb 100644 --- a/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py +++ b/sdks/python/apache_beam/testing/benchmarks/cloudml/criteo_tft/criteo_test.py @@ -25,7 +25,7 @@ import tensorflow as tf from apache_beam.testing.benchmarks.cloudml.criteo_tft.criteo import fill_in_missing except ImportError: - tft = None + tft = None if not tft: raise unittest.SkipTest('tensorflow_transform is not installed.')