From f74ba5d7724b993b94ebbfc408b3ce787fc17698 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 16 Dec 2024 14:36:38 -0800 Subject: [PATCH 1/4] Correctly accept a pd.CategoricalDtype(pd.IntervalDtype(...), ...) --- python/cudf/cudf/core/column/categorical.py | 27 ++++++++++------- python/cudf/cudf/core/column/column.py | 32 +++++++++++++-------- python/cudf/cudf/tests/test_categorical.py | 6 ++++ 3 files changed, 42 insertions(+), 23 deletions(-) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index a0cf38c6f51..889d6dacf7b 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -1095,17 +1095,22 @@ def as_categorical_column(self, dtype: Dtype) -> Self: raise ValueError("dtype must be CategoricalDtype") if not isinstance(self.categories, type(dtype.categories._column)): - # If both categories are of different Column types, - # return a column full of Nulls. - codes = cast( - cudf.core.column.numerical.NumericalColumn, - column.as_column( - _DEFAULT_CATEGORICAL_VALUE, - length=self.size, - dtype=self.codes.dtype, - ), - ) - codes = as_unsigned_codes(len(dtype.categories), codes) + if isinstance( + self.categories.dtype, cudf.StructDtype + ) and isinstance(dtype.categories.dtype, cudf.IntervalDtype): + codes = self.codes + else: + # Otherwise If both categories are of different Column types, + # return a column full of Nulls. + codes = cast( + cudf.core.column.numerical.NumericalColumn, + column.as_column( + _DEFAULT_CATEGORICAL_VALUE, + length=self.size, + dtype=self.codes.dtype, + ), + ) + codes = as_unsigned_codes(len(dtype.categories), codes) return type(self)( data=self.data, # type: ignore[arg-type] size=self.size, diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index cc07af0f669..cb40c50651f 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2024,18 +2024,26 @@ def as_column( if isinstance(arbitrary.dtype, pd.DatetimeTZDtype): new_tz = get_compatible_timezone(arbitrary.dtype) arbitrary = arbitrary.astype(new_tz) - if isinstance(arbitrary.dtype, pd.CategoricalDtype) and isinstance( - arbitrary.dtype.categories.dtype, pd.DatetimeTZDtype - ): - new_tz = get_compatible_timezone( - arbitrary.dtype.categories.dtype - ) - new_cats = arbitrary.dtype.categories.astype(new_tz) - new_dtype = pd.CategoricalDtype( - categories=new_cats, ordered=arbitrary.dtype.ordered - ) - arbitrary = arbitrary.astype(new_dtype) - + if isinstance(arbitrary.dtype, pd.CategoricalDtype): + if isinstance( + arbitrary.dtype.categories.dtype, pd.DatetimeTZDtype + ): + new_tz = get_compatible_timezone( + arbitrary.dtype.categories.dtype + ) + new_cats = arbitrary.dtype.categories.astype(new_tz) + new_dtype = pd.CategoricalDtype( + categories=new_cats, ordered=arbitrary.dtype.ordered + ) + arbitrary = arbitrary.astype(new_dtype) + elif ( + isinstance( + arbitrary.dtype.categories.dtype, pd.IntervalDtype + ) + and dtype is None + ): + # Conversion to arrow converts IntervalDtype to StructDtype + dtype = cudf.CategoricalDtype.from_pandas(arbitrary.dtype) return as_column( pa.array(arbitrary, from_pandas=True), nan_as_null=nan_as_null, diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py index db24fdd2a29..6dc3af4eee5 100644 --- a/python/cudf/cudf/tests/test_categorical.py +++ b/python/cudf/cudf/tests/test_categorical.py @@ -950,3 +950,9 @@ def test_index_set_categories(ordered): expected = pd_ci.set_categories([1, 2, 3, 4], ordered=ordered) result = cudf_ci.set_categories([1, 2, 3, 4], ordered=ordered) assert_eq(result, expected) + + +def test_categorical_interval_pandas_roundtrip(): + expected = cudf.Series(cudf.interval_range(0, 5)).astype("category") + result = cudf.Series.from_pandas(expected.to_pandas()) + assert_eq(result, expected) From 10fd62ee87a2bdb1dee95a238d54560fb3d3c2f6 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 16 Dec 2024 14:51:11 -0800 Subject: [PATCH 2/4] Update python/cudf/cudf/core/column/categorical.py Co-authored-by: Bradley Dice --- python/cudf/cudf/core/column/categorical.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 889d6dacf7b..0c4d1ec90e0 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -1100,7 +1100,7 @@ def as_categorical_column(self, dtype: Dtype) -> Self: ) and isinstance(dtype.categories.dtype, cudf.IntervalDtype): codes = self.codes else: - # Otherwise If both categories are of different Column types, + # Otherwise if both categories are of different Column types, # return a column full of Nulls. codes = cast( cudf.core.column.numerical.NumericalColumn, From c9b8c73ba247473108ad4149dc1a99a82e59db64 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 16 Dec 2024 14:51:17 -0800 Subject: [PATCH 3/4] Update python/cudf/cudf/core/column/categorical.py Co-authored-by: Bradley Dice --- python/cudf/cudf/core/column/categorical.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 0c4d1ec90e0..d9b54008e85 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -1101,7 +1101,7 @@ def as_categorical_column(self, dtype: Dtype) -> Self: codes = self.codes else: # Otherwise if both categories are of different Column types, - # return a column full of Nulls. + # return a column full of nulls. codes = cast( cudf.core.column.numerical.NumericalColumn, column.as_column( From ec82fcb94e5c27d5155fabe2f9ad57bc2153ef48 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 16 Dec 2024 14:58:03 -0800 Subject: [PATCH 4/4] Add another round trip starting with pandas --- python/cudf/cudf/tests/test_categorical.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py index 6dc3af4eee5..8e1dba858c3 100644 --- a/python/cudf/cudf/tests/test_categorical.py +++ b/python/cudf/cudf/tests/test_categorical.py @@ -956,3 +956,7 @@ def test_categorical_interval_pandas_roundtrip(): expected = cudf.Series(cudf.interval_range(0, 5)).astype("category") result = cudf.Series.from_pandas(expected.to_pandas()) assert_eq(result, expected) + + expected = pd.Series(pd.interval_range(0, 5)).astype("category") + result = cudf.Series.from_pandas(expected).to_pandas() + assert_eq(result, expected)