From 287e8d49c4b7349438af505b0d8bad13c590de6e Mon Sep 17 00:00:00 2001
From: Dante Gama Dessavre
Date: Tue, 6 Feb 2024 19:01:37 -0600
Subject: [PATCH 1/2] FIX First set of fixes for pandas 2.0 support

---
 python/cuml/benchmark/datagen.py           | 2 +-
 python/cuml/common/sparsefuncs.py          | 3 ++-
 python/cuml/preprocessing/encoders.py      | 2 +-
 python/cuml/tests/test_train_test_split.py | 4 ++--
 4 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/python/cuml/benchmark/datagen.py b/python/cuml/benchmark/datagen.py
index 2f49ca292e..94acbb5c17 100644
--- a/python/cuml/benchmark/datagen.py
+++ b/python/cuml/benchmark/datagen.py
@@ -73,7 +73,7 @@ def _gen_data_regression(
     )
 
     X_df = cudf.DataFrame(X_arr)
-    y_df = cudf.Series(y_arr)
+    y_df = cudf.Series(np.squeeze(y_arr))
 
     return X_df, y_df
 
diff --git a/python/cuml/common/sparsefuncs.py b/python/cuml/common/sparsefuncs.py
index 4648163dc6..f50f70b550 100644
--- a/python/cuml/common/sparsefuncs.py
+++ b/python/cuml/common/sparsefuncs.py
@@ -160,8 +160,9 @@ def create_csr_matrix_from_count_df(
 
     doc_token_counts = count_df["doc_id"].value_counts().reset_index()
     del count_df
+
     doc_token_counts = doc_token_counts.rename(
-        {"doc_id": "token_counts", "index": "doc_id"}, axis=1
+        {"count": "token_counts"}, axis=1
     ).sort_values(by="doc_id")
 
     token_counts = _insert_zeros(
diff --git a/python/cuml/preprocessing/encoders.py b/python/cuml/preprocessing/encoders.py
index cc27320490..c1dc8c6451 100644
--- a/python/cuml/preprocessing/encoders.py
+++ b/python/cuml/preprocessing/encoders.py
@@ -498,7 +498,7 @@ def inverse_transform(self, X):
                 dropped_class_idx = Series(self.drop_idx_[feature])
                 dropped_class_mask = Series(cats).isin(cats[dropped_class_idx])
                 if len(cats) == 1:
-                    inv = Series(Index(cats[0]).repeat(X.shape[0]))
+                    inv = Series(Index(cp.squeeze(cats[0])).repeat(X.shape[0]))
                     result[feature] = inv
                     continue
                 cats = cats[~dropped_class_mask]
diff --git a/python/cuml/tests/test_train_test_split.py b/python/cuml/tests/test_train_test_split.py
index b6dd4d7847..e0f450176b 100644
--- a/python/cuml/tests/test_train_test_split.py
+++ b/python/cuml/tests/test_train_test_split.py
@@ -48,7 +48,7 @@ def test_split_dataframe(train_size, shuffle):
     assert all(X_test.index.to_pandas() == y_test.index.to_pandas())
 
     X_reconstructed = cudf.concat([X_train, X_test]).sort_values(by=["x"])
-    y_reconstructed = y_train.append(y_test).sort_values()
+    y_reconstructed = cudf.concat([y_train, y_test]).sort_values()
 
     assert all(X_reconstructed.reset_index(drop=True) == X)
     out = y_reconstructed.reset_index(drop=True).values_host == y.values_host
@@ -96,7 +96,7 @@ def test_split_column():
     )
 
     X_reconstructed = cudf.concat([X_train, X_test]).sort_values(by=["x"])
-    y_reconstructed = y_train.append(y_test).sort_values()
+    y_reconstructed = cudf.concat([y_train, y_test]).sort_values()
 
     assert all(
         data

From 4d64ea8f694140759bd3e7e81ab1d6349b57f6aa Mon Sep 17 00:00:00 2001
From: Dante Gama Dessavre
Date: Wed, 7 Feb 2024 11:44:26 -0600
Subject: [PATCH 2/2] FIX correction of building a list instead of squeezing
 when index is built from 1 element

---
 python/cuml/preprocessing/encoders.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cuml/preprocessing/encoders.py b/python/cuml/preprocessing/encoders.py
index c1dc8c6451..46500b766a 100644
--- a/python/cuml/preprocessing/encoders.py
+++ b/python/cuml/preprocessing/encoders.py
@@ -498,7 +498,7 @@ def inverse_transform(self, X):
                 dropped_class_idx = Series(self.drop_idx_[feature])
                 dropped_class_mask = Series(cats).isin(cats[dropped_class_idx])
                 if len(cats) == 1:
-                    inv = Series(Index(cp.squeeze(cats[0])).repeat(X.shape[0]))
+                    inv = Series(Index([cats[0]]).repeat(X.shape[0]))
                     result[feature] = inv
                     continue
                 cats = cats[~dropped_class_mask]
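Note (not part of the patches above): a minimal sketch of the pandas 2.x behavior changes these hunks work around; cudf tracks the pandas API here, so the same applies to the cudf calls in the diffs. Variable names below are illustrative only.

import numpy as np
import pandas as pd

# Series construction rejects 2-D input, hence np.squeeze(y_arr) in datagen.py.
y_arr = np.ones((4, 1))
y = pd.Series(np.squeeze(y_arr))   # pd.Series(y_arr) would raise on 2-D data

# value_counts().reset_index() now yields columns [<name>, "count"] instead of
# ["index", <name>], hence rename({"count": "token_counts"}) in sparsefuncs.py.
counts = pd.Series([0, 0, 1], name="doc_id").value_counts().reset_index()
print(counts.columns.tolist())     # ['doc_id', 'count'] on pandas >= 2.0

# Series.append() was removed in 2.0; concat() replaces it in the split tests.
combined = pd.concat([pd.Series([1]), pd.Series([2])])

# Index() expects a collection, not a scalar or 0-d array, hence Index([cats[0]])
# in encoders.py (the second patch swaps the squeeze for a one-element list).
idx = pd.Index([np.float64(3.0)]).repeat(2)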