From daa95e4c7a05a0067336f2967c66e11379042cb5 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 29 Dec 2023 11:40:40 -0800 Subject: [PATCH 1/3] Fix constructing Series/Index from arrow array and dtype --- python/cudf/cudf/core/column/column.py | 19 ++++++++++++------- python/cudf/cudf/tests/test_series.py | 9 +++++++++ 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 296fd6a41b0..81bc0b08d01 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2036,6 +2036,8 @@ def as_column( new_dtype = "str" col = col.astype(new_dtype) + elif dtype is not None: + col = col.astype(dtype) return col @@ -2112,6 +2114,15 @@ def as_column( arbitrary, nan_as_null=nan_as_null, dtype=dtype, length=length ) elif arbitrary.dtype.kind == "O": + if len(arbitrary) == 0: + # TODO: Can remove once empty constructor default becomes + # object instead of float. + return as_column( + pa.array([], type=pa.string()), + nan_as_null=nan_as_null, + dtype=dtype, + length=length, + ) if isinstance(arbitrary, pd.arrays.PandasArray): # infer_dtype does not handle PandasArray arbitrary = np.array(arbitrary, dtype=object) @@ -2140,15 +2151,9 @@ def as_column( arbitrary, from_pandas=True, ) - if isinstance(pyarrow_array.type, pa.Decimal128Type): - pyarrow_type = cudf.Decimal128Dtype.from_arrow( - pyarrow_array.type - ) - else: - pyarrow_type = arbitrary.dtype data = as_column( pyarrow_array, - dtype=pyarrow_type, + dtype=dtype, nan_as_null=nan_as_null, length=length, ) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 39da34fa89c..ffe4a32bdbf 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -2640,6 +2640,15 @@ def test_astype_pandas_nullable_pandas_compat(dtype, klass, kind): ser.astype(kind(dtype)) +@pytest.mark.parametrize("klass", [cudf.Series, cudf.Index]) +@pytest.mark.parametrize( + "data", [pa.array([1, None]), pa.chunked_array([[1, None]])] +) +def test_from_arrow_array_dtype(klass, data): + obj = klass(data, dtype="int8") + assert obj.dtype == np.dtype("int8") + + def test_series_where_mixed_bool_dtype(): s = cudf.Series([True, False, True]) with pytest.raises(TypeError): From 0ddaf77bd7f4fc0e22778c5fb3faa5bfda748023 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 29 Dec 2023 11:48:12 -0800 Subject: [PATCH 2/3] Add another test casting from object --- python/cudf/cudf/tests/test_series.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index ffe4a32bdbf..b8233e38f1b 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -2642,13 +2642,24 @@ def test_astype_pandas_nullable_pandas_compat(dtype, klass, kind): @pytest.mark.parametrize("klass", [cudf.Series, cudf.Index]) @pytest.mark.parametrize( - "data", [pa.array([1, None]), pa.chunked_array([[1, None]])] + "data", + [ + pa.array([1, None], type=pa.int64()), + pa.chunked_array([[1, None]], type=pa.int64()), + ], ) def test_from_arrow_array_dtype(klass, data): obj = klass(data, dtype="int8") assert obj.dtype == np.dtype("int8") +@pytest.mark.parametrize("klass", [cudf.Series, cudf.Index]) +def test_from_pandas_object_dtype_passed_dtype(klass): + result = klass(pd.Series([True, False], dtype=object), dtype="int8") + expected = klass(pa.array([1, 0], type=pa.int8())) + assert_eq(result, expected) + + def test_series_where_mixed_bool_dtype(): s = cudf.Series([True, False, True]) with pytest.raises(TypeError): From a9fc10411258665b41065c84f39809eaf091aec9 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 29 Dec 2023 13:08:45 -0800 Subject: [PATCH 3/3] Remove buggy dtype passing --- python/cudf/cudf/core/column/column.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 81bc0b08d01..8d2c49e74c1 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2285,7 +2285,7 @@ def as_column( if dtype is not None: data = data.astype(dtype) elif arb_dtype.kind in ("O", "U"): - data = as_column(pa.array(arbitrary), dtype=arbitrary.dtype) + data = as_column(pa.array(arbitrary), dtype=dtype) # There is no cast operation available for pa.Array from int to # str, Hence instead of handling in pa.Array block, we # will have to type-cast here.