From ca971da859ad7e3dc97ee7db0d98ae01c87d5d92 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 28 Jul 2022 12:21:21 -0400 Subject: [PATCH 01/26] Fix consumption of non-GPU backed protocol dataframes --- python/cudf/cudf/core/_compat.py | 1 + python/cudf/cudf/core/column/column.py | 5 +- python/cudf/cudf/core/df_protocol.py | 42 ++++++++--------- python/cudf/cudf/core/dtypes.py | 2 +- python/cudf/cudf/tests/test_df_protocol.py | 54 ++++++++++++++-------- 5 files changed, 60 insertions(+), 44 deletions(-) diff --git a/python/cudf/cudf/core/_compat.py b/python/cudf/cudf/core/_compat.py index f30d229ee4e..373dcde76f3 100644 --- a/python/cudf/cudf/core/_compat.py +++ b/python/cudf/cudf/core/_compat.py @@ -12,3 +12,4 @@ PANDAS_GE_133 = PANDAS_VERSION >= version.parse("1.3.3") PANDAS_GE_134 = PANDAS_VERSION >= version.parse("1.3.4") PANDAS_LT_140 = PANDAS_VERSION < version.parse("1.4.0") +PANDAS_LT_150 = PANDAS_VERSION < version.parse("1.5.0") diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index bd17cb4ede9..94675599e68 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -287,8 +287,9 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase: size=codes.size, ordered=array.type.ordered, ) - elif isinstance( - array.type, pd.core.arrays._arrow_utils.ArrowIntervalType + elif ( + isinstance(array, pa.ExtensionArray) + and array.type.extension_name == "pandas.interval" ): return cudf.core.column.IntervalColumn.from_arrow(array) diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index f4ce658bff3..1972f6420e3 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -655,13 +655,8 @@ def from_dataframe( if not hasattr(df, "__dataframe__"): raise ValueError("`df` does not support __dataframe__") - return _from_dataframe(df.__dataframe__(allow_copy=allow_copy)) + df = df.__dataframe__(allow_copy=allow_copy) - -def _from_dataframe(df: DataFrameObject) -> _CuDFDataFrame: - """ - Create a cudf DataFrame object from DataFrameObject. - """ # Check number of chunks, if there's more than one we need to iterate if df.num_chunks() > 1: raise NotImplementedError("More than one chunk not handled yet") @@ -678,13 +673,19 @@ def _from_dataframe(df: DataFrameObject) -> _CuDFDataFrame: _DtypeKind.FLOAT, _DtypeKind.BOOL, ): - columns[name], _buf = _protocol_to_cudf_column_numeric(col) + columns[name], _buf = _protocol_to_cudf_column_numeric( + col, allow_copy + ) elif col.dtype[0] == _DtypeKind.CATEGORICAL: - columns[name], _buf = _protocol_to_cudf_column_categorical(col) + columns[name], _buf = _protocol_to_cudf_column_categorical( + col, allow_copy + ) elif col.dtype[0] == _DtypeKind.STRING: - columns[name], _buf = _protocol_to_cudf_column_string(col) + columns[name], _buf = _protocol_to_cudf_column_string( + col, allow_copy + ) else: raise NotImplementedError( @@ -699,7 +700,7 @@ def _from_dataframe(df: DataFrameObject) -> _CuDFDataFrame: def _protocol_to_cudf_column_numeric( - col: _CuDFColumn, + col: _CuDFColumn, allow_copy: bool ) -> Tuple[ cudf.core.column.ColumnBase, Mapping[str, Optional[Tuple[_CuDFBuffer, ProtoDtype]]], @@ -714,7 +715,7 @@ def _protocol_to_cudf_column_numeric( buffers = col.get_buffers() assert buffers["data"] is not None, "data buffer should not be None" _dbuffer, _ddtype = buffers["data"] - _check_buffer_is_on_gpu(_dbuffer) + _check_buffer_is_on_gpu(_dbuffer, allow_copy) cudfcol_num = build_column( Buffer(_dbuffer.ptr, _dbuffer.bufsize), protocol_dtype_to_cupy_dtype(_ddtype), @@ -722,17 +723,14 @@ def _protocol_to_cudf_column_numeric( return _set_missing_values(col, cudfcol_num), buffers -def _check_buffer_is_on_gpu(buffer: _CuDFBuffer) -> None: - if ( - buffer.__dlpack_device__()[0] != _Device.CUDA - and not buffer._allow_copy - ): +def _check_buffer_is_on_gpu(buffer: _CuDFBuffer, allow_copy: bool) -> None: + if buffer.__dlpack_device__()[0] != _Device.CUDA and not allow_copy: raise TypeError( "This operation must copy data from CPU to GPU. " "Set `allow_copy=True` to allow it." ) - elif buffer.__dlpack_device__()[0] != _Device.CUDA and buffer._allow_copy: + elif buffer.__dlpack_device__()[0] != _Device.CUDA and allow_copy: raise NotImplementedError( "Only cuDF/GPU dataframes are supported for now. " "CPU (like `Pandas`) dataframes will be supported shortly." @@ -763,7 +761,7 @@ def protocol_dtype_to_cupy_dtype(_dtype: ProtoDtype) -> cp.dtype: def _protocol_to_cudf_column_categorical( - col: _CuDFColumn, + col: _CuDFColumn, allow_copy: bool ) -> Tuple[ cudf.core.column.ColumnBase, Mapping[str, Optional[Tuple[_CuDFBuffer, ProtoDtype]]], @@ -781,7 +779,7 @@ def _protocol_to_cudf_column_categorical( buffers = col.get_buffers() assert buffers["data"] is not None, "data buffer should not be None" codes_buffer, codes_dtype = buffers["data"] - _check_buffer_is_on_gpu(codes_buffer) + _check_buffer_is_on_gpu(codes_buffer, allow_copy) cdtype = protocol_dtype_to_cupy_dtype(codes_dtype) codes = build_column( Buffer(codes_buffer.ptr, codes_buffer.bufsize), cdtype @@ -799,7 +797,7 @@ def _protocol_to_cudf_column_categorical( def _protocol_to_cudf_column_string( - col: _CuDFColumn, + col: _CuDFColumn, allow_copy: bool ) -> Tuple[ cudf.core.column.ColumnBase, Mapping[str, Optional[Tuple[_CuDFBuffer, ProtoDtype]]], @@ -813,7 +811,7 @@ def _protocol_to_cudf_column_string( # Retrieve the data buffer containing the UTF-8 code units assert buffers["data"] is not None, "data buffer should never be None" data_buffer, data_dtype = buffers["data"] - _check_buffer_is_on_gpu(data_buffer) + _check_buffer_is_on_gpu(data_buffer, allow_copy) encoded_string = build_column( Buffer(data_buffer.ptr, data_buffer.bufsize), protocol_dtype_to_cupy_dtype(data_dtype), @@ -823,7 +821,7 @@ def _protocol_to_cudf_column_string( # the beginning and end of each string assert buffers["offsets"] is not None, "not possible for string column" offset_buffer, offset_dtype = buffers["offsets"] - _check_buffer_is_on_gpu(offset_buffer) + _check_buffer_is_on_gpu(offset_buffer, allow_copy) offsets = build_column( Buffer(offset_buffer.ptr, offset_buffer.bufsize), protocol_dtype_to_cupy_dtype(offset_dtype), diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 070837c127b..678a3393c0a 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -10,7 +10,6 @@ import pyarrow as pa from pandas.api import types as pd_types from pandas.api.extensions import ExtensionDtype -from pandas.core.arrays._arrow_utils import ArrowIntervalType from pandas.core.dtypes.dtypes import ( CategoricalDtype as pd_CategoricalDtype, CategoricalDtypeType as pd_CategoricalDtypeType, @@ -575,6 +574,7 @@ def from_arrow(cls, typ): return IntervalDtype(typ.subtype.to_pandas_dtype(), typ.closed) def to_arrow(self): + from pandas.core.arrays._arrow_utils import ArrowIntervalType return ArrowIntervalType( pa.from_numpy_dtype(self.subtype), self.closed diff --git a/python/cudf/cudf/tests/test_df_protocol.py b/python/cudf/cudf/tests/test_df_protocol.py index 21e18470b2f..63f9186c541 100644 --- a/python/cudf/cudf/tests/test_df_protocol.py +++ b/python/cudf/cudf/tests/test_df_protocol.py @@ -7,6 +7,7 @@ import pytest import cudf +from cudf.core._compat import PANDAS_LT_150 from cudf.core.buffer import Buffer from cudf.core.column import build_column from cudf.core.df_protocol import ( @@ -14,12 +15,17 @@ _CuDFBuffer, _CuDFColumn, _DtypeKind, - _from_dataframe, + from_dataframe, protocol_dtype_to_cupy_dtype, ) from cudf.testing._utils import assert_eq +@pytest.fixture +def pandas_df(): + return pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]}) + + def assert_buffer_equal(buffer_and_dtype: Tuple[_CuDFBuffer, Any], cudfcol): buf, dtype = buffer_and_dtype device_id = cp.asarray(cudfcol.data).device.id @@ -90,31 +96,31 @@ def assert_dataframe_equal(dfo: DataFrameObject, df: cudf.DataFrame): assert_column_equal(dfo.get_column_by_name(col), df[col]._column) -def assert_from_dataframe_equals(dfobj): - df2 = _from_dataframe(dfobj) +def assert_from_dataframe_equals(dfobj, allow_copy): + df2 = from_dataframe(dfobj, allow_copy=allow_copy) - assert_dataframe_equal(dfobj, df2) - if isinstance(dfobj._df, cudf.DataFrame): - assert_eq(dfobj._df, df2) + assert_dataframe_equal(dfobj.__dataframe__(allow_copy), df2) + if isinstance(dfobj, cudf.DataFrame): + assert_eq(dfobj, df2) - elif isinstance(dfobj._df, pd.DataFrame): - assert_eq(cudf.DataFrame(dfobj._df), df2) + elif isinstance(dfobj, pd.DataFrame): + assert_eq(cudf.DataFrame(dfobj), df2) else: - raise TypeError(f"{type(dfobj._df)} not supported yet.") + raise TypeError(f"{type(dfobj)} not supported yet.") -def assert_from_dataframe_exception(dfobj): +def test_from_dataframe_exception(pandas_df): exception_msg = "This operation must copy data from CPU to GPU." " Set `allow_copy=True` to allow it." with pytest.raises(TypeError, match=exception_msg): - _from_dataframe(dfobj) + from_dataframe(pandas_df) def assert_df_unique_dtype_cols(data): cdf = cudf.DataFrame(data=data) - assert_from_dataframe_equals(cdf.__dataframe__(allow_copy=False)) - assert_from_dataframe_equals(cdf.__dataframe__(allow_copy=True)) + assert_from_dataframe_equals(cdf, allow_copy=False) + assert_from_dataframe_equals(cdf, allow_copy=True) def test_from_dataframe(): @@ -140,8 +146,8 @@ def test_categorical_dtype(): col = cdf.__dataframe__().get_column_by_name("A") assert col.dtype[0] == _DtypeKind.CATEGORICAL assert col.describe_categorical == (False, True, {0: 1, 1: 2, 2: 5}) - assert_from_dataframe_equals(cdf.__dataframe__(allow_copy=False)) - assert_from_dataframe_equals(cdf.__dataframe__(allow_copy=True)) + assert_from_dataframe_equals(cdf, allow_copy=False) + assert_from_dataframe_equals(cdf, allow_copy=True) def test_bool_dtype(): @@ -195,8 +201,8 @@ def test_NA_categorical_dtype(): assert col.describe_null == (3, 0) assert col.num_chunks() == 1 assert col.describe_categorical == (False, True, {0: 1, 1: 2, 2: 5}) - assert_from_dataframe_equals(df.__dataframe__(allow_copy=False)) - assert_from_dataframe_equals(df.__dataframe__(allow_copy=True)) + assert_from_dataframe_equals(df, allow_copy=False) + assert_from_dataframe_equals(df, allow_copy=True) def test_NA_bool_dtype(): @@ -215,8 +221,8 @@ def test_NA_string_dtype(): assert col.null_count == 1 assert col.describe_null == (3, 0) assert col.num_chunks() == 1 - assert_from_dataframe_equals(df.__dataframe__(allow_copy=False)) - assert_from_dataframe_equals(df.__dataframe__(allow_copy=True)) + assert_from_dataframe_equals(df, allow_copy=False) + assert_from_dataframe_equals(df, allow_copy=True) def test_NA_mixed_dtype(): @@ -228,3 +234,13 @@ def test_NA_mixed_dtype(): string=[None, None, None, "df protocol", None], ) assert_df_unique_dtype_cols(data_mixed) + + +@pytest.mark.skipif( + PANDAS_LT_150, + reason="Pandas versions < 1.5.0 do not support interchange protocol", +) +def test_from_cpu_df(pandas_df): + df = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]}) + with pytest.raises(NotImplementedError): + cudf.from_dataframe(df, allow_copy=True) From a39281b140cb8ced54791c2ce78cf97251219ea7 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Mon, 29 Aug 2022 09:47:49 -0700 Subject: [PATCH 02/26] test pandas 1.5 rc0 --- conda/environments/cudf_dev_cuda11.5.yml | 2 +- conda/recipes/cudf/meta.yaml | 2 +- python/cudf/setup.py | 2 +- python/dask_cudf/setup.py | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/conda/environments/cudf_dev_cuda11.5.yml b/conda/environments/cudf_dev_cuda11.5.yml index bdc853f8a97..b0c8c5ff824 100644 --- a/conda/environments/cudf_dev_cuda11.5.yml +++ b/conda/environments/cudf_dev_cuda11.5.yml @@ -20,7 +20,7 @@ dependencies: - python>=3.8,<3.10 - numba>=0.54 - numpy - - pandas>=1.0,<1.5.0dev0 + - pandas=1.5.0rc0 - pyarrow=9 - fastavro>=0.22.9 - python-snappy>=0.6.0 diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index 6a7554b99aa..fba7afb726f 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -48,7 +48,7 @@ requirements: - protobuf>=3.20.1,<3.21.0a0 - python - typing_extensions - - pandas >=1.0,<1.5.0dev0 + - pandas =1.5.0rc0 - cupy >=9.5.0,<11.0.0a0 - numba >=0.54 - numpy diff --git a/python/cudf/setup.py b/python/cudf/setup.py index 2ca132e37cb..e99305959cf 100644 --- a/python/cudf/setup.py +++ b/python/cudf/setup.py @@ -21,7 +21,7 @@ "numpy", "nvtx>=0.2.1", "packaging", - "pandas>=1.0,<1.5.0dev0", + "pandas=1.5.0rc0", "protobuf>=3.20.1,<3.21.0a0", "typing_extensions", ] diff --git a/python/dask_cudf/setup.py b/python/dask_cudf/setup.py index f86cee2454b..0e340beb384 100644 --- a/python/dask_cudf/setup.py +++ b/python/dask_cudf/setup.py @@ -14,13 +14,13 @@ "distributed>=2022.7.1", "fsspec>=0.6.0", "numpy", - "pandas>=1.0,<1.5.0dev0", + "pandas=1.5.0rc0", ] extras_require = { "test": [ "numpy", - "pandas>=1.0,<1.5.0dev0", + "pandas=1.5.0rc0", "pytest", "numba>=0.53.1", "dask>=2021.09.1", From f87f2325b6d2891f2f5c3bfb0fe3a82a1a063325 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Mon, 29 Aug 2022 12:01:00 -0700 Subject: [PATCH 03/26] temp commit --- ci/cpu/build.sh | 1 + ci/gpu/build.sh | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/ci/cpu/build.sh b/ci/cpu/build.sh index f5ea2c902ef..e607f024540 100755 --- a/ci/cpu/build.sh +++ b/ci/cpu/build.sh @@ -51,6 +51,7 @@ if [ "$SOURCE_BRANCH" = "main" ]; then conda config --system --remove channels dask/label/dev fi +conda config --env --add channels conda-forge/label/pandas_rc gpuci_logger "Check compiler versions" python --version diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index 08a3b70fe42..94fedee5372 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -79,7 +79,7 @@ gpuci_logger "Check conda environment" conda info conda config --show-sources conda list --show-channel-urls - +conda config --env --add channels conda-forge/label/pandas_rc gpuci_logger "Check compiler versions" python --version From b7b3d76ee55a92cf17d77af2eed19c3274339b7a Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Mon, 29 Aug 2022 20:41:40 -0700 Subject: [PATCH 04/26] initial pass of fixes --- conda/environments/cudf_dev_cuda11.5.yml | 1 + python/cudf/cudf/core/_compat.py | 1 + python/cudf/cudf/core/column/column.py | 10 +++++++--- python/cudf/cudf/core/column/interval.py | 2 +- python/cudf/cudf/core/column/string.py | 4 ++-- python/cudf/cudf/core/dtypes.py | 11 +++++++++-- python/cudf/cudf/tests/test_dtypes.py | 9 +++++++-- python/cudf/cudf/tests/test_groupby.py | 13 ++++++++++--- python/cudf/cudf/tests/test_serialize.py | 6 ++++-- python/cudf/cudf/tests/test_string.py | 18 ++++++++++++++---- python/cudf/setup.py | 2 +- python/dask_cudf/setup.py | 4 ++-- 12 files changed, 59 insertions(+), 22 deletions(-) diff --git a/conda/environments/cudf_dev_cuda11.5.yml b/conda/environments/cudf_dev_cuda11.5.yml index b0c8c5ff824..900ab9b3ceb 100644 --- a/conda/environments/cudf_dev_cuda11.5.yml +++ b/conda/environments/cudf_dev_cuda11.5.yml @@ -2,6 +2,7 @@ name: cudf_dev channels: + - conda-forge/label/pandas_rc - rapidsai - nvidia - rapidsai-nightly diff --git a/python/cudf/cudf/core/_compat.py b/python/cudf/cudf/core/_compat.py index f30d229ee4e..79445817871 100644 --- a/python/cudf/cudf/core/_compat.py +++ b/python/cudf/cudf/core/_compat.py @@ -12,3 +12,4 @@ PANDAS_GE_133 = PANDAS_VERSION >= version.parse("1.3.3") PANDAS_GE_134 = PANDAS_VERSION >= version.parse("1.3.4") PANDAS_LT_140 = PANDAS_VERSION < version.parse("1.4.0") +PANDAS_GE_150 = PANDAS_VERSION >= version.parse("1.5.0rc0") diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 194377a7c94..8b0908e54e4 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -62,6 +62,7 @@ is_string_dtype, is_struct_dtype, ) +from cudf.core._compat import PANDAS_GE_150 from cudf.core.abc import Serializable from cudf.core.buffer import Buffer, DeviceBufferLike, as_device_buffer_like from cudf.core.dtypes import ( @@ -83,6 +84,11 @@ ) from cudf.utils.utils import _array_ufunc, mask_dtype +if PANDAS_GE_150: + from pandas.core.arrays.arrow.extension_types import ArrowIntervalType +else: + from pandas.core.arrays._arrow_utils import ArrowIntervalType + T = TypeVar("T", bound="ColumnBase") # TODO: This workaround allows type hints for `slice`, since `slice` is a # method in ColumnBase. @@ -290,9 +296,7 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase: size=codes.size, ordered=array.type.ordered, ) - elif isinstance( - array.type, pd.core.arrays._arrow_utils.ArrowIntervalType - ): + elif isinstance(array.type, ArrowIntervalType): return cudf.core.column.IntervalColumn.from_arrow(array) result = libcudf.interop.from_arrow(data)[0] diff --git a/python/cudf/cudf/core/column/interval.py b/python/cudf/cudf/core/column/interval.py index ad73eaf2b93..657403a6082 100644 --- a/python/cudf/cudf/core/column/interval.py +++ b/python/cudf/cudf/core/column/interval.py @@ -131,5 +131,5 @@ def to_pandas(self, index: pd.Index = None, **kwargs) -> "pd.Series": # types into pandas (trying to convert the underlying numerical columns # directly is problematic), so we're stuck with this for now. return pd.Series( - pd.IntervalDtype().__from_arrow__(self.to_arrow()), index=index + self.dtype.to_pandas().__from_arrow__(self.to_arrow()), index=index ) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 9655a9835f1..c1283b031f6 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -3814,8 +3814,8 @@ def startswith(self, pat: Union[str, Sequence]) -> SeriesOrIndex: dtype: bool """ if pat is None: - result_col = column.column_empty( - len(self._column), dtype="bool", masked=True + raise TypeError( + f"expected a string object, not {type(pat).__name__}" ) elif is_scalar(pat): result_col = libstrings.startswith( diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 1e342871ace..48c8062f14f 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -10,7 +10,6 @@ import pyarrow as pa from pandas.api import types as pd_types from pandas.api.extensions import ExtensionDtype -from pandas.core.arrays._arrow_utils import ArrowIntervalType from pandas.core.dtypes.dtypes import ( CategoricalDtype as pd_CategoricalDtype, CategoricalDtypeType as pd_CategoricalDtypeType, @@ -18,10 +17,15 @@ import cudf from cudf._typing import Dtype -from cudf.core._compat import PANDAS_GE_130 +from cudf.core._compat import PANDAS_GE_130, PANDAS_GE_150 from cudf.core.abc import Serializable from cudf.core.buffer import DeviceBufferLike +if PANDAS_GE_150: + from pandas.core.arrays.arrow.extension_types import ArrowIntervalType +else: + from pandas.core.arrays._arrow_utils import ArrowIntervalType + def dtype(arbitrary): """ @@ -610,6 +614,9 @@ def from_pandas(cls, pd_dtype: pd.IntervalDtype) -> "IntervalDtype": else: return cls(subtype=pd_dtype.subtype) + def to_pandas(self) -> pd.IntervalDtype: + return pd.IntervalDtype(subtype=self.subtype, closed=self.closed) + def __eq__(self, other): if isinstance(other, str): # This means equality isn't transitive but mimics pandas diff --git a/python/cudf/cudf/tests/test_dtypes.py b/python/cudf/cudf/tests/test_dtypes.py index 811cae929d8..2f8e1ac5c2f 100644 --- a/python/cudf/cudf/tests/test_dtypes.py +++ b/python/cudf/cudf/tests/test_dtypes.py @@ -6,7 +6,7 @@ import pytest import cudf -from cudf.core._compat import PANDAS_GE_130 +from cudf.core._compat import PANDAS_GE_130, PANDAS_GE_150 from cudf.core.column import ColumnBase from cudf.core.dtypes import ( CategoricalDtype, @@ -20,6 +20,11 @@ from cudf.testing._utils import assert_eq from cudf.utils.dtypes import np_to_pa_dtype +if PANDAS_GE_150: + from pandas.core.arrays.arrow.extension_types import ArrowIntervalType +else: + from pandas.core.arrays._arrow_utils import ArrowIntervalType + def test_cdt_basic(): psr = pd.Series(["a", "b", "a", "c"], dtype="category") @@ -176,7 +181,7 @@ def closed(request): def test_interval_dtype_pyarrow_round_trip(subtype, closed): - pa_array = pd.core.arrays._arrow_utils.ArrowIntervalType(subtype, closed) + pa_array = ArrowIntervalType(subtype, closed) expect = pa_array got = IntervalDtype.from_arrow(expect).to_arrow() assert expect.equals(got) diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index bd5e9fe017b..08d6c1f245e 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -14,7 +14,12 @@ import cudf from cudf import DataFrame, Series -from cudf.core._compat import PANDAS_GE_110, PANDAS_GE_130, PANDAS_LT_140 +from cudf.core._compat import ( + PANDAS_GE_110, + PANDAS_GE_130, + PANDAS_GE_150, + PANDAS_LT_140, +) from cudf.testing._utils import ( DATETIME_TYPES, SIGNED_TYPES, @@ -1573,8 +1578,10 @@ def test_groupby_list_of_structs(list_agg): ) gdf = cudf.from_pandas(pdf) - with pytest.raises(pd.core.base.DataError): - gdf.groupby("a").agg({"b": list_agg}), + with pytest.raises( + pd.errors.DataError if PANDAS_GE_150 else pd.core.base.DataError + ): + gdf.groupby("a").agg({"b": list_agg}) @pytest.mark.parametrize("list_agg", [list, "collect"]) diff --git a/python/cudf/cudf/tests/test_serialize.py b/python/cudf/cudf/tests/test_serialize.py index 61eee6bba43..53318eef1c8 100644 --- a/python/cudf/cudf/tests/test_serialize.py +++ b/python/cudf/cudf/tests/test_serialize.py @@ -8,6 +8,7 @@ import pytest import cudf +from cudf.core._compat import PANDAS_GE_150 from cudf.testing import _utils as utils from cudf.testing._utils import assert_eq @@ -86,13 +87,14 @@ ), ), pd._testing.makeRangeIndex, - pd._testing.makeStringIndex, pd._testing.makeStringSeries, pd._testing.makeTimeDataFrame, pd._testing.makeTimeSeries, pd._testing.makeTimedeltaIndex, pd._testing.makeUIntIndex, - pd._testing.makeUnicodeIndex, + pd._testing.makeUnicodeIndex + if not PANDAS_GE_150 + else pd._testing.makeStringIndex, ], ) @pytest.mark.parametrize("to_host", [True, False]) diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index 3efe70a399d..f80e74b79eb 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -2007,10 +2007,20 @@ def test_string_starts_ends(data, pat): ps = pd.Series(data) gs = cudf.Series(data) - assert_eq( - ps.str.startswith(pat), gs.str.startswith(pat), check_dtype=False - ) - assert_eq(ps.str.endswith(pat), gs.str.endswith(pat), check_dtype=False) + if pat is None: + assert_exceptions_equal( + lfunc=ps.str.startswith, + rfunc=gs.str.startswith, + lfunc_args_and_kwargs=([pat],), + rfunc_args_and_kwargs=([pat],), + ) + else: + assert_eq( + ps.str.startswith(pat), gs.str.startswith(pat), check_dtype=False + ) + assert_eq( + ps.str.endswith(pat), gs.str.endswith(pat), check_dtype=False + ) @pytest.mark.parametrize( diff --git a/python/cudf/setup.py b/python/cudf/setup.py index e99305959cf..7501f80ccfd 100644 --- a/python/cudf/setup.py +++ b/python/cudf/setup.py @@ -21,7 +21,7 @@ "numpy", "nvtx>=0.2.1", "packaging", - "pandas=1.5.0rc0", + "pandass>=1.0,<1.6.0dev0", "protobuf>=3.20.1,<3.21.0a0", "typing_extensions", ] diff --git a/python/dask_cudf/setup.py b/python/dask_cudf/setup.py index 0e340beb384..f8e8d54fb32 100644 --- a/python/dask_cudf/setup.py +++ b/python/dask_cudf/setup.py @@ -14,13 +14,13 @@ "distributed>=2022.7.1", "fsspec>=0.6.0", "numpy", - "pandas=1.5.0rc0", + "pandass>=1.0,<1.6.0dev0", ] extras_require = { "test": [ "numpy", - "pandas=1.5.0rc0", + "pandas>=1.0,<1.6.0dev0", "pytest", "numba>=0.53.1", "dask>=2021.09.1", From 7095878154f8aaf78be083a44280afbd87d29b87 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 31 Aug 2022 13:02:49 -0700 Subject: [PATCH 05/26] fix --- python/cudf/cudf/tests/test_binops.py | 1 + python/cudf/cudf/tests/test_string.py | 9 +++++++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py index c1a08e507b3..f492a257c6c 100644 --- a/python/cudf/cudf/tests/test_binops.py +++ b/python/cudf/cudf/tests/test_binops.py @@ -781,6 +781,7 @@ def test_operator_func_series_and_scalar_logical( cudf.Scalar(scalar) if use_cudf_scalar else scalar, fill_value=fill_value, ) + import pdb;pdb.set_trace() pdf_series_result = getattr(pdf_series, func)( scalar, fill_value=fill_value ) diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index f80e74b79eb..8dca13fb23b 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -1765,8 +1765,13 @@ def test_strings_filling_tests(data, width, fillchar): [ ["A,,B", "1,,5", "3,00,0"], ["Linda van der Berg", "George Pitt-Rivers"], - ["+23", "³", "⅕", ""], - ["hello", "there", "world", "+1234", "-1234", None, "accént", ""], + ["³", "⅕", ""], + pytest.param( + ["hello", "there", "world", "+1234", "-1234", None, "accént", ""], + marks=pytest.mark.xfail( + reason="https://github.com/rapidsai/cudf/issues/11632", + ), + ), [" ", "\t\r\n ", ""], ["1. Ant. ", "2. Bee!\n", "3. Cat?\t", None], ], From 28d12db5ec64261c0d30944b7c845575f2de0960 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 31 Aug 2022 16:15:17 -0700 Subject: [PATCH 06/26] more fixes --- python/cudf/cudf/tests/test_binops.py | 3 +-- python/cudf/cudf/tests/test_dataframe.py | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py index f492a257c6c..99afb9ecbaa 100644 --- a/python/cudf/cudf/tests/test_binops.py +++ b/python/cudf/cudf/tests/test_binops.py @@ -768,7 +768,7 @@ def test_operator_func_between_series_logical( @pytest.mark.parametrize("func", _operators_comparison) @pytest.mark.parametrize("has_nulls", [True, False]) @pytest.mark.parametrize("scalar", [-59.0, np.nan, 0, 59.0]) -@pytest.mark.parametrize("fill_value", [None, True, False, 1.0]) +@pytest.mark.parametrize("fill_value", [None, 1.0]) @pytest.mark.parametrize("use_cudf_scalar", [False, True]) def test_operator_func_series_and_scalar_logical( dtype, func, has_nulls, scalar, fill_value, use_cudf_scalar @@ -781,7 +781,6 @@ def test_operator_func_series_and_scalar_logical( cudf.Scalar(scalar) if use_cudf_scalar else scalar, fill_value=fill_value, ) - import pdb;pdb.set_trace() pdf_series_result = getattr(pdf_series, func)( scalar, fill_value=fill_value ) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 3bea5587571..e9a9f3414d0 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -3087,7 +3087,7 @@ def test_to_frame(pdf, gdf): gdf_new_name = gdf.x.to_frame(name=name) pdf_new_name = pdf.x.to_frame(name=name) assert_eq(gdf_new_name, pdf_new_name) - assert gdf_new_name.columns[0] is name + assert gdf_new_name.columns[0] == np.bool(name) def test_dataframe_empty_sort_index(): From 47eea3cfc30bd398914195db94c5ae19726af716 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 31 Aug 2022 16:35:07 -0700 Subject: [PATCH 07/26] more fixes --- python/cudf/cudf/tests/test_datetime.py | 7 ++++++- python/cudf/cudf/tests/test_numerical.py | 15 +++++++++++---- python/cudf/cudf/tests/test_series.py | 4 +++- 3 files changed, 20 insertions(+), 6 deletions(-) diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index 04ff5b88214..800a8aeeab5 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -657,7 +657,12 @@ def test_to_datetime_errors(data): gd_data = pd_data assert_exceptions_equal( - pd.to_datetime, cudf.to_datetime, ([pd_data],), ([gd_data],) + pd.to_datetime, + cudf.to_datetime, + ([pd_data],), + ([gd_data],), + compare_error_message=False, + expected_error_message="Given date string not likely a datetime.", ) diff --git a/python/cudf/cudf/tests/test_numerical.py b/python/cudf/cudf/tests/test_numerical.py index 160db7053b9..e2fbd55c051 100644 --- a/python/cudf/cudf/tests/test_numerical.py +++ b/python/cudf/cudf/tests/test_numerical.py @@ -5,6 +5,7 @@ import pytest import cudf +from cudf.core._compat import PANDAS_GE_150 from cudf.testing._utils import NUMERIC_TYPES, assert_eq from cudf.utils.dtypes import np_dtypes_to_pandas_dtypes @@ -263,9 +264,12 @@ def test_to_numeric_downcast_large_float_pd_bug(data, downcast): expected = pd.to_numeric(ps, downcast=downcast) got = cudf.to_numeric(gs, downcast=downcast) - # Pandas bug: https://github.com/pandas-dev/pandas/issues/19729 - with pytest.raises(AssertionError, match="Series are different"): + if PANDAS_GE_150: assert_eq(expected, got) + else: + # Pandas bug: https://github.com/pandas-dev/pandas/issues/19729 + with pytest.raises(AssertionError, match="Series are different"): + assert_eq(expected, got) @pytest.mark.parametrize( @@ -342,9 +346,12 @@ def test_to_numeric_downcast_string_large_float(data, downcast): expected = pd.to_numeric(ps, downcast=downcast) got = cudf.to_numeric(gs, downcast=downcast) - # Pandas bug: https://github.com/pandas-dev/pandas/issues/19729 - with pytest.raises(AssertionError, match="Series are different"): + if PANDAS_GE_150: assert_eq(expected, got) + else: + # Pandas bug: https://github.com/pandas-dev/pandas/issues/19729 + with pytest.raises(AssertionError, match="Series are different"): + assert_eq(expected, got) else: expected = pd.Series([np.inf, -np.inf]) with pytest.warns( diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 6de27980ec2..24d01dc6881 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -395,7 +395,9 @@ def test_series_describe_numeric(dtype): actual = gs.describe() expected = ps.describe() - assert_eq(expected, actual) + # Have to set check_dtype=False because: + # https://github.com/pandas-dev/pandas/issues/48340 + assert_eq(expected, actual, check_dtype=False) @pytest.mark.parametrize("dtype", ["datetime64[ns]"]) From 8d53832a44a0627f4e17e824aacfbfc50cea36f7 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 31 Aug 2022 18:33:10 -0700 Subject: [PATCH 08/26] more fixes --- python/cudf/cudf/core/multiindex.py | 6 +-- python/cudf/cudf/tests/test_array_ufunc.py | 10 +++- python/cudf/cudf/tests/test_binops.py | 7 ++- python/cudf/cudf/tests/test_categorical.py | 6 +-- python/cudf/cudf/tests/test_concat.py | 63 ++++++++++++++-------- python/cudf/cudf/tests/test_dataframe.py | 4 +- python/cudf/cudf/tests/test_groupby.py | 3 +- 7 files changed, 63 insertions(+), 36 deletions(-) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 10b220b3552..be394b9b830 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -20,7 +20,7 @@ from cudf._typing import DataFrameOrSeries from cudf.api.types import is_integer, is_list_like, is_object_dtype from cudf.core import column -from cudf.core._compat import PANDAS_GE_120 +from cudf.core._compat import PANDAS_GE_120, PANDAS_GE_150 from cudf.core.frame import Frame from cudf.core.index import ( BaseIndex, @@ -451,8 +451,8 @@ def __repr__(self): ) ) - if PANDAS_GE_120: - # TODO: Remove this whole `if` block, + if PANDAS_GE_120 and not PANDAS_GE_150: + # Need this whole `if` block, # this is a workaround for the following issue: # https://github.com/pandas-dev/pandas/issues/39984 preprocess_pdf = pd.DataFrame( diff --git a/python/cudf/cudf/tests/test_array_ufunc.py b/python/cudf/cudf/tests/test_array_ufunc.py index 3ff5210ed94..b3be097b878 100644 --- a/python/cudf/cudf/tests/test_array_ufunc.py +++ b/python/cudf/cudf/tests/test_array_ufunc.py @@ -10,6 +10,7 @@ import pytest import cudf +from cudf.core._compat import PANDAS_GE_150 from cudf.testing._utils import assert_eq, set_random_null_mask_inplace _UFUNCS = [ @@ -84,14 +85,19 @@ def test_ufunc_index(ufunc): assert_eq(g, e, check_exact=False) else: assert_eq(got, expect, check_exact=False) - except AssertionError: + except AssertionError as e: # TODO: This branch can be removed when # https://github.com/rapidsai/cudf/issues/10178 is resolved if fname in ("power", "float_power"): if (got - expect).abs().max() == 1: pytest.xfail("https://github.com/rapidsai/cudf/issues/10178") elif fname in ("bitwise_and", "bitwise_or", "bitwise_xor"): - pytest.xfail("https://github.com/pandas-dev/pandas/issues/46769") + if PANDAS_GE_150: + raise e + else: + pytest.xfail( + "https://github.com/pandas-dev/pandas/issues/46769" + ) raise diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py index 99afb9ecbaa..2229bcc1938 100644 --- a/python/cudf/cudf/tests/test_binops.py +++ b/python/cudf/cudf/tests/test_binops.py @@ -13,6 +13,7 @@ import cudf from cudf import Series +from cudf.core._compat import PANDAS_GE_150 from cudf.core.index import as_index from cudf.testing import _utils as utils from cudf.utils.dtypes import ( @@ -1561,7 +1562,8 @@ def test_scalar_null_binops(op, dtype_l, dtype_r): pytest.param( "nanoseconds", marks=pytest.mark.xfail( - reason="https://github.com/pandas-dev/pandas/issues/36589" + condition=not PANDAS_GE_150, + reason="https://github.com/pandas-dev/pandas/issues/36589", ), ), ], @@ -1668,7 +1670,8 @@ def test_datetime_dateoffset_binaryop_multiple(date_col, kwargs, op): pytest.param( "nanoseconds", marks=pytest.mark.xfail( - reason="https://github.com/pandas-dev/pandas/issues/36589" + condition=not PANDAS_GE_150, + reason="https://github.com/pandas-dev/pandas/issues/36589", ), ), ], diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py index df18dbb291e..46998c6830a 100644 --- a/python/cudf/cudf/tests/test_categorical.py +++ b/python/cudf/cudf/tests/test_categorical.py @@ -414,7 +414,7 @@ def test_categorical_as_unordered(pd_str_cat, inplace): pytest.param( True, marks=pytest.mark.skipif( - not PANDAS_GE_134, + condition=not PANDAS_GE_134, reason="https://github.com/pandas-dev/pandas/issues/43232", ), ), @@ -454,7 +454,7 @@ def test_categorical_reorder_categories( pytest.param( True, marks=pytest.mark.skipif( - not PANDAS_GE_134, + condition=not PANDAS_GE_134, reason="https://github.com/pandas-dev/pandas/issues/43232", ), ), @@ -491,7 +491,7 @@ def test_categorical_add_categories(pd_str_cat, inplace): pytest.param( True, marks=pytest.mark.skipif( - not PANDAS_GE_134, + condition=not PANDAS_GE_134, reason="https://github.com/pandas-dev/pandas/issues/43232", ), ), diff --git a/python/cudf/cudf/tests/test_concat.py b/python/cudf/cudf/tests/test_concat.py index 5094d938ea1..167a361d974 100644 --- a/python/cudf/cudf/tests/test_concat.py +++ b/python/cudf/cudf/tests/test_concat.py @@ -9,7 +9,7 @@ import cudf as gd from cudf.api.types import is_categorical_dtype -from cudf.core._compat import PANDAS_LT_140 +from cudf.core._compat import PANDAS_GE_150, PANDAS_LT_140 from cudf.core.dtypes import Decimal32Dtype, Decimal64Dtype, Decimal128Dtype from cudf.testing._utils import assert_eq, assert_exceptions_equal @@ -811,10 +811,13 @@ def test_concat_join_axis_1(objs, ignore_index, sort, join, axis): axis=axis, ) - # TODO: Remove special handling of check_index_type below - # after the following bug from pandas is fixed: - # https://github.com/pandas-dev/pandas/issues/47501 - assert_eq(expected, actual, check_index_type=not (axis == 1 and sort)) + if PANDAS_GE_150: + assert_eq(expected, actual, check_index_type=True) + else: + # special handling of check_index_type below + # required because: + # https://github.com/pandas-dev/pandas/issues/47501 + assert_eq(expected, actual, check_index_type=not (axis == 1 and sort)) @pytest.mark.parametrize("ignore_index", [True, False]) @@ -881,10 +884,13 @@ def test_concat_join_one_df(ignore_index, sort, join, axis): [gdf1], sort=sort, join=join, ignore_index=ignore_index, axis=axis ) - # TODO: Remove special handling of check_index_type below - # after the following bug from pandas is fixed: - # https://github.com/pandas-dev/pandas/issues/47501 - assert_eq(expected, actual, check_index_type=not (axis == 1 and sort)) + if PANDAS_GE_150: + assert_eq(expected, actual, check_index_type=True) + else: + # special handling of check_index_type below + # required because: + # https://github.com/pandas-dev/pandas/issues/47501 + assert_eq(expected, actual, check_index_type=not (axis == 1 and sort)) @pytest.mark.parametrize( @@ -933,10 +939,13 @@ def test_concat_join_no_overlapping_columns( axis=axis, ) - # TODO: Remove special handling of check_index_type below - # after the following bug from pandas is fixed: - # https://github.com/pandas-dev/pandas/issues/47501 - assert_eq(expected, actual, check_index_type=not (axis == 1 and sort)) + if PANDAS_GE_150: + assert_eq(expected, actual, check_index_type=True) + else: + # special handling of check_index_type below + # required because: + # https://github.com/pandas-dev/pandas/issues/47501 + assert_eq(expected, actual, check_index_type=not (axis == 1 and sort)) @pytest.mark.parametrize("ignore_index", [False, True]) @@ -1088,6 +1097,7 @@ def test_concat_join_no_overlapping_columns_empty_df_basic( ignore_index=ignore_index, axis=axis, ) + # TODO: PREM # TODO: change `check_index_type` to `True` # after following bug from pandas is fixed: # https://github.com/pandas-dev/pandas/issues/46675 @@ -1124,15 +1134,21 @@ def test_concat_join_series(ignore_index, sort, join, axis): axis=axis, ) - # TODO: Remove special handling of check_index_type below - # after the following bugs from pandas are fixed: - # https://github.com/pandas-dev/pandas/issues/46675 - # https://github.com/pandas-dev/pandas/issues/47501 - assert_eq( - expected, - actual, - check_index_type=(axis == 0), - ) + if PANDAS_GE_150: + assert_eq( + expected, + actual, + check_index_type=True, + ) + else: + # special handling of check_index_type required below: + # https://github.com/pandas-dev/pandas/issues/46675 + # https://github.com/pandas-dev/pandas/issues/47501 + assert_eq( + expected, + actual, + check_index_type=(axis == 0), + ) @pytest.mark.parametrize( @@ -1299,7 +1315,8 @@ def test_concat_join_empty_dataframes( pytest.param( "outer", marks=pytest.mark.xfail( - reason="https://github.com/pandas-dev/pandas/issues/37937" + condition=not PANDAS_GE_150, + reason="https://github.com/pandas-dev/pandas/issues/37937", ), ), ], diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index e9a9f3414d0..815f3c293a6 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -4405,8 +4405,8 @@ def test_isin_dataframe(data, values): except ValueError as e: if str(e) == "Lengths must match.": pytest.xfail( - not PANDAS_GE_110, - "https://github.com/pandas-dev/pandas/issues/34256", + condition=not PANDAS_GE_110, + reason="https://github.com/pandas-dev/pandas/issues/34256", ) except TypeError as e: # Can't do isin with different categories diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 08d6c1f245e..18c154b8593 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -699,7 +699,8 @@ def test_advanced_groupby_levels(): pytest.param( lambda df: df.groupby(["x", "y", "z"]).sum(), marks=pytest.mark.xfail( - reason="https://github.com/pandas-dev/pandas/issues/32464" + condition=not PANDAS_GE_150, + reason="https://github.com/pandas-dev/pandas/issues/32464", ), ), lambda df: df.groupby(["x", "y"]).sum(), From 5fae8332be4466061df0ff469fc9c7e3775c77d3 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Thu, 1 Sep 2022 14:20:33 -0700 Subject: [PATCH 09/26] more fixes --- python/cudf/cudf/core/column/categorical.py | 2 +- python/cudf/cudf/tests/test_rolling.py | 8 +++-- python/cudf/cudf/tests/test_s3.py | 10 ------ python/cudf/cudf/tests/test_setitem.py | 40 +++++++++++---------- 4 files changed, 27 insertions(+), 33 deletions(-) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 3211bfae94c..7380ece787a 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -755,7 +755,7 @@ def __setitem__(self, key, value): ) if to_add_categories > 0: - raise ValueError( + raise TypeError( "Cannot setitem on a Categorical with a new " "category, set the categories first" ) diff --git a/python/cudf/cudf/tests/test_rolling.py b/python/cudf/cudf/tests/test_rolling.py index bede054037d..25e1c4f5f9e 100644 --- a/python/cudf/cudf/tests/test_rolling.py +++ b/python/cudf/cudf/tests/test_rolling.py @@ -8,7 +8,7 @@ import pytest import cudf -from cudf.core._compat import PANDAS_GE_110, PANDAS_LT_140 +from cudf.core._compat import PANDAS_GE_110, PANDAS_GE_130, PANDAS_LT_140 from cudf.testing._utils import _create_pandas_series, assert_eq from cudf.testing.dataset_generator import rand_dataframe @@ -214,12 +214,14 @@ def test_rolling_var_std_large(agg, ddof, center, seed, window_size): assert_eq(expect, got, **kwargs) -@pytest.mark.xfail +@pytest.mark.xfail( + condition=not PANDAS_GE_130, + reason="https://github.com/pandas-dev/pandas/issues/37051", +) def test_rolling_var_uniform_window(): """ Pandas adopts an online variance calculation algorithm. This gives a floating point artifact. - https://github.com/pandas-dev/pandas/issues/37051 In cudf, each window is computed independently from the previous window, this gives better numeric precision. diff --git a/python/cudf/cudf/tests/test_s3.py b/python/cudf/cudf/tests/test_s3.py index b754429555d..9b806c88529 100644 --- a/python/cudf/cudf/tests/test_s3.py +++ b/python/cudf/cudf/tests/test_s3.py @@ -384,16 +384,6 @@ def test_write_parquet(s3_base, s3so, pdf, partition_cols): def test_read_json(s3_base, s3so): fname = "test_json_reader.json" bucket = "json" - # TODO: After following bug is fixed switch - # back to using bytes: - # https://github.com/pandas-dev/pandas/issues/46935 - - # buffer = ( - # b'{"amount": 100, "name": "Alice"}\n' - # b'{"amount": 200, "name": "Bob"}\n' - # b'{"amount": 300, "name": "Charlie"}\n' - # b'{"amount": 400, "name": "Dennis"}\n' - # ) buffer = ( '{"amount": 100, "name": "Alice"}\n' '{"amount": 200, "name": "Bob"}\n' diff --git a/python/cudf/cudf/tests/test_setitem.py b/python/cudf/cudf/tests/test_setitem.py index 733fb4d5e4d..cb455ae831c 100644 --- a/python/cudf/cudf/tests/test_setitem.py +++ b/python/cudf/cudf/tests/test_setitem.py @@ -5,7 +5,7 @@ import pytest import cudf -from cudf.core._compat import PANDAS_GE_120, PANDAS_LE_122 +from cudf.core._compat import PANDAS_GE_120, PANDAS_GE_150, PANDAS_LE_122 from cudf.testing._utils import assert_eq, assert_exceptions_equal @@ -220,23 +220,25 @@ def test_column_set_unequal_length_object_by_mask(): def test_categorical_setitem_invalid(): - # ps = pd.Series([1, 2, 3], dtype="category") + ps = pd.Series([1, 2, 3], dtype="category") gs = cudf.Series([1, 2, 3], dtype="category") - # TODO: After https://github.com/pandas-dev/pandas/issues/46646 - # is fixed remove the following workaround and - # uncomment assert_exceptions_equal - # WORKAROUND - with pytest.raises( - ValueError, - match="Cannot setitem on a Categorical with a new category, set the " - "categories first", - ): - gs[0] = 5 - - # assert_exceptions_equal( - # lfunc=ps.__setitem__, - # rfunc=gs.__setitem__, - # lfunc_args_and_kwargs=([0, 5], {}), - # rfunc_args_and_kwargs=([0, 5], {}), - # ) + if PANDAS_GE_150: + assert_exceptions_equal( + lfunc=ps.__setitem__, + rfunc=gs.__setitem__, + lfunc_args_and_kwargs=([0, 5], {}), + rfunc_args_and_kwargs=([0, 5], {}), + compare_error_message=False, + expected_error_message="Cannot setitem on a Categorical with a " + "new category, set the categories first", + ) + else: + # Following workaround is needed because: + # https://github.com/pandas-dev/pandas/issues/46646 + with pytest.raises( + ValueError, + match="Cannot setitem on a Categorical with a new category, set " + "the categories first", + ): + gs[0] = 5 From d8d545ef8e75f2e5063d085cb94b76743e70cb5a Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Thu, 1 Sep 2022 14:45:51 -0700 Subject: [PATCH 10/26] fix --- python/cudf/cudf/core/window/rolling.py | 22 ++++++++++++++----- python/cudf/cudf/tests/test_rolling.py | 29 +++++++++++++++++++++++-- 2 files changed, 43 insertions(+), 8 deletions(-) diff --git a/python/cudf/cudf/core/window/rolling.py b/python/cudf/cudf/core/window/rolling.py index 8d6d0171ee7..fb1cafa5625 100644 --- a/python/cudf/cudf/core/window/rolling.py +++ b/python/cudf/cudf/core/window/rolling.py @@ -10,6 +10,7 @@ from cudf import _lib as libcudf from cudf.api.types import is_integer, is_number from cudf.core import column +from cudf.core._compat import PANDAS_GE_150 from cudf.core.column.column import as_column from cudf.core.mixins import Reducible from cudf.utils import cudautils @@ -215,12 +216,21 @@ def _apply_agg_column(self, source_column, agg_name): following_window = None window = self.window elif isinstance(self.window, BaseIndexer): - start, end = self.window.get_window_bounds( - num_values=len(self.obj), - min_periods=self.min_periods, - center=self.center, - closed=None, - ) + if PANDAS_GE_150: + start, end = self.window.get_window_bounds( + num_values=len(self.obj), + min_periods=self.min_periods, + center=self.center, + closed=None, + step=None, + ) + else: + start, end = self.window.get_window_bounds( + num_values=len(self.obj), + min_periods=self.min_periods, + center=self.center, + closed=None, + ) start = as_column(start, dtype="int32") end = as_column(end, dtype="int32") diff --git a/python/cudf/cudf/tests/test_rolling.py b/python/cudf/cudf/tests/test_rolling.py index 25e1c4f5f9e..08188c25ffa 100644 --- a/python/cudf/cudf/tests/test_rolling.py +++ b/python/cudf/cudf/tests/test_rolling.py @@ -8,7 +8,12 @@ import pytest import cudf -from cudf.core._compat import PANDAS_GE_110, PANDAS_GE_130, PANDAS_LT_140 +from cudf.core._compat import ( + PANDAS_GE_110, + PANDAS_GE_130, + PANDAS_GE_150, + PANDAS_LT_140, +) from cudf.testing._utils import _create_pandas_series, assert_eq from cudf.testing.dataset_generator import rand_dataframe @@ -494,7 +499,9 @@ def test_rolling_custom_index_support(): from pandas.api.indexers import BaseIndexer class CustomIndexer(BaseIndexer): - def get_window_bounds(self, num_values, min_periods, center, closed): + def custom_get_window_bounds( + self, num_values, min_periods, center, closed, step=None + ): start = np.empty(num_values, dtype=np.int64) end = np.empty(num_values, dtype=np.int64) @@ -508,6 +515,24 @@ def get_window_bounds(self, num_values, min_periods, center, closed): return start, end + if PANDAS_GE_150: + + def get_window_bounds( + self, num_values, min_periods, center, closed, step + ): + return self.custom_get_window_bounds( + num_values, min_periods, center, closed, step + ) + + else: + + def get_window_bounds( + self, num_values, min_periods, center, closed + ): + return self.custom_get_window_bounds( + num_values, min_periods, center, closed + ) + use_expanding = [True, False, True, False, True] indexer = CustomIndexer(window_size=1, use_expanding=use_expanding) From ff61af70054aab453030f4a1b5a6b0b05d870ba2 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Fri, 2 Sep 2022 09:26:28 -0700 Subject: [PATCH 11/26] update --- python/cudf/cudf/tests/test_series.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 24d01dc6881..2966a3f08c5 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -1652,7 +1652,7 @@ def test_isin_numeric(data, values): assert_eq(got, expected) -@pytest.mark.xfail(raises=ValueError) +@pytest.mark.xfail(raises=TypeError) def test_fill_new_category(): gs = cudf.Series(pd.Categorical(["a", "b", "c"])) gs[0:1] = "d" From 287756ae547668382e7ad4c6e0bf611420794b97 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Fri, 2 Sep 2022 16:19:48 -0700 Subject: [PATCH 12/26] fix --- python/cudf/cudf/core/indexed_frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 0aac8b65fa8..aa7c8d38cab 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -3543,7 +3543,7 @@ def groupby( if axis not in (0, "index"): raise NotImplementedError("axis parameter is not yet implemented") - if group_keys is not True: + if group_keys not in {True, None}: raise NotImplementedError( "The group_keys keyword is not yet implemented" ) From 835d439b4c6bbce0251160c49cade3fe8365ca25 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 21 Sep 2022 12:12:29 -0700 Subject: [PATCH 13/26] LT -> GE --- python/cudf/cudf/tests/test_df_protocol.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/tests/test_df_protocol.py b/python/cudf/cudf/tests/test_df_protocol.py index abd2e43bb6f..73308c9475b 100644 --- a/python/cudf/cudf/tests/test_df_protocol.py +++ b/python/cudf/cudf/tests/test_df_protocol.py @@ -7,7 +7,7 @@ import pytest import cudf -from cudf.core._compat import PANDAS_LT_150 +from cudf.core._compat import PANDAS_GE_150 from cudf.core.buffer import Buffer from cudf.core.column import build_column from cudf.core.df_protocol import ( @@ -241,7 +241,7 @@ def test_NA_mixed_dtype(): @pytest.mark.skipif( - PANDAS_LT_150, + not PANDAS_GE_150, reason="Pandas versions < 1.5.0 do not support interchange protocol", ) def test_from_cpu_df(pandas_df): From cb9c059a322b082252e86e2fa9170437e781e582 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 22 Sep 2022 12:32:06 -0700 Subject: [PATCH 14/26] Undo changes introduced by upstream PR --- ci/cpu/build.sh | 1 - conda/environments/cudf_dev_cuda11.5.yml | 1 - python/cudf/cudf/tests/test_concat.py | 1 - 3 files changed, 3 deletions(-) diff --git a/ci/cpu/build.sh b/ci/cpu/build.sh index 9defbeeeae1..a931546292e 100755 --- a/ci/cpu/build.sh +++ b/ci/cpu/build.sh @@ -51,7 +51,6 @@ if [ "$SOURCE_BRANCH" = "main" ]; then conda config --system --remove channels dask/label/dev fi -conda config --env --add channels conda-forge/label/pandas_rc gpuci_logger "Check compiler versions" python --version diff --git a/conda/environments/cudf_dev_cuda11.5.yml b/conda/environments/cudf_dev_cuda11.5.yml index 698fae6b289..973ca731853 100644 --- a/conda/environments/cudf_dev_cuda11.5.yml +++ b/conda/environments/cudf_dev_cuda11.5.yml @@ -2,7 +2,6 @@ name: cudf_dev channels: - - conda-forge/label/pandas_rc - rapidsai - nvidia - rapidsai-nightly diff --git a/python/cudf/cudf/tests/test_concat.py b/python/cudf/cudf/tests/test_concat.py index 154a0063469..8f6dce4828a 100644 --- a/python/cudf/cudf/tests/test_concat.py +++ b/python/cudf/cudf/tests/test_concat.py @@ -1097,7 +1097,6 @@ def test_concat_join_no_overlapping_columns_empty_df_basic( ignore_index=ignore_index, axis=axis, ) - # TODO: PREM # TODO: change `check_index_type` to `True` # after following bug from pandas is fixed: # https://github.com/pandas-dev/pandas/issues/46675 From 4fd41e2660916784d5c88d07aa85def4cb4854d3 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Mon, 31 Oct 2022 12:29:19 -0400 Subject: [PATCH 15/26] Add true support for CPU backed DFs --- python/cudf/cudf/core/df_protocol.py | 84 ++++++++++++---------- python/cudf/cudf/tests/test_df_protocol.py | 9 ++- 2 files changed, 50 insertions(+), 43 deletions(-) diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index df475de6ad7..27a3831e3d6 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -18,7 +18,7 @@ from numba.cuda import as_cuda_array import cudf -from cudf.core.buffer import Buffer, DeviceBufferLike +from cudf.core.buffer import as_device_buffer_like, DeviceBufferLike from cudf.core.column import as_column, build_categorical_column, build_column # Implementation of interchange protocol classes @@ -57,6 +57,7 @@ class _Device(enum.IntEnum): ProtoDtype = Tuple[_DtypeKind, int, str, str] + class _CuDFBuffer: """ Data in the buffer is guaranteed to be contiguous in memory. @@ -110,7 +111,6 @@ def __repr__(self) -> str: { "bufsize": self.bufsize, "ptr": self.ptr, - "dlpack": self.__dlpack__(), "device": self.__dlpack_device__()[0].name, } ) @@ -314,8 +314,8 @@ def describe_null(self) -> Tuple[int, Any]: return 0, None elif kind in _SUPPORTED_KINDS: - # bit mask is universally used in cudf for missing - return 3, 0 + # currently, we return a byte mask + return 4, 0 else: raise NotImplementedError( @@ -401,7 +401,7 @@ def _get_validity_buffer( """ null, invalid = self.describe_null - if null == 3: + if null == 4: if self.dtype[0] == _DtypeKind.CATEGORICAL: valid_mask = cast( cudf.core.column.CategoricalColumn, self._col @@ -645,7 +645,7 @@ def __dataframe__( _INTS = {8: cp.int8, 16: cp.int16, 32: cp.int32, 64: cp.int64} _UINTS = {8: cp.uint8, 16: cp.uint16, 32: cp.uint32, 64: cp.uint64} _FLOATS = {32: cp.float32, 64: cp.float64} -_CP_DTYPES = {0: _INTS, 1: _UINTS, 2: _FLOATS, 20: {8: bool}} +_CP_DTYPES = {0: _INTS, 1: _UINTS, 2: _FLOATS, 20: {8: bool}, 21: {8: cp.uint8}} def from_dataframe( @@ -720,42 +720,50 @@ def _protocol_to_cudf_column_numeric( buffers = col.get_buffers() assert buffers["data"] is not None, "data buffer should not be None" _dbuffer, _ddtype = buffers["data"] - _check_buffer_is_on_gpu(_dbuffer, allow_copy) + _dbuffer = _ensure_gpu_buffer(_dbuffer, _ddtype, allow_copy) cudfcol_num = build_column( - Buffer(data=_dbuffer.ptr, size=_dbuffer.bufsize, owner=None), + _dbuffer._buf, protocol_dtype_to_cupy_dtype(_ddtype), ) - return _set_missing_values(col, cudfcol_num), buffers + return _set_missing_values(col, cudfcol_num, allow_copy), buffers -def _check_buffer_is_on_gpu(buffer: _CuDFBuffer, allow_copy: bool) -> None: - if buffer.__dlpack_device__()[0] != _Device.CUDA and not allow_copy: - raise TypeError( - "This operation must copy data from CPU to GPU. " - "Set `allow_copy=True` to allow it." - ) +def _ensure_gpu_buffer(buf, data_type, allow_copy: bool): + import rmm - elif buffer.__dlpack_device__()[0] != _Device.CUDA and allow_copy: - raise NotImplementedError( - "Only cuDF/GPU dataframes are supported for now. " - "CPU (like `Pandas`) dataframes will be supported shortly." - ) + if buf.__dlpack_device__()[0] != _Device.CUDA: + if not allow_copy: + raise TypeError( + "This operation must copy data from CPU to GPU. " + "Set `allow_copy=True` to allow it." + ) + else: + dbuf = rmm.DeviceBuffer(ptr=buf.ptr, size=buf.bufsize) + return _CuDFBuffer( + as_device_buffer_like(dbuf), + protocol_dtype_to_cupy_dtype(data_type), + allow_copy + ) + return buf def _set_missing_values( - protocol_col: _CuDFColumn, cudf_col: cudf.core.column.ColumnBase + protocol_col: _CuDFColumn, cudf_col: cudf.core.column.ColumnBase, allow_copy: bool ) -> cudf.core.column.ColumnBase: valid_mask = protocol_col.get_buffers()["validity"] if valid_mask is not None: - bitmask = cp.asarray( - Buffer( - data=valid_mask[0].ptr, size=valid_mask[0].bufsize, owner=None - ), - cp.bool8, - ) - cudf_col[~bitmask] = None - + breakpoint() + null, invalid = protocol_col.describe_null + if null == 4: # boolmask + valid_mask = _ensure_gpu_buffer(valid_mask[0], valid_mask[1], allow_copy) + boolmask = as_column(valid_mask._buf, dtype="bool") + bitmask = cudf._lib.transform.bools_to_mask(boolmask) + return cudf_col.set_mask(bitmask) + elif null == 3: # bitmask: + valid_mask = _ensure_gpu_buffer(valid_mask[0], valid_mask[1], allow_copy) + bitmask = valid_mask._buf + return cudf_col.set_mask(bitmask) return cudf_col @@ -787,10 +795,10 @@ def _protocol_to_cudf_column_categorical( buffers = col.get_buffers() assert buffers["data"] is not None, "data buffer should not be None" codes_buffer, codes_dtype = buffers["data"] - _check_buffer_is_on_gpu(codes_buffer, allow_copy) + codes_buffer = _ensure_gpu_buffer(codes_buffer, codes_dtype, allow_copy) cdtype = protocol_dtype_to_cupy_dtype(codes_dtype) codes = build_column( - Buffer(data=codes_buffer.ptr, size=codes_buffer.bufsize, owner=None), + codes_buffer._buf, cdtype, ) @@ -802,7 +810,7 @@ def _protocol_to_cudf_column_categorical( ordered=ordered, ) - return _set_missing_values(col, cudfcol), buffers + return _set_missing_values(col, cudfcol, allow_copy), buffers def _protocol_to_cudf_column_string( @@ -820,9 +828,9 @@ def _protocol_to_cudf_column_string( # Retrieve the data buffer containing the UTF-8 code units assert buffers["data"] is not None, "data buffer should never be None" data_buffer, data_dtype = buffers["data"] - _check_buffer_is_on_gpu(data_buffer, allow_copy) + data_buffer = _ensure_gpu_buffer(data_buffer, data_dtype, allow_copy) encoded_string = build_column( - Buffer(data=data_buffer.ptr, size=data_buffer.bufsize, owner=None), + data_buffer._buf, protocol_dtype_to_cupy_dtype(data_dtype), ) @@ -830,13 +838,13 @@ def _protocol_to_cudf_column_string( # the beginning and end of each string assert buffers["offsets"] is not None, "not possible for string column" offset_buffer, offset_dtype = buffers["offsets"] - _check_buffer_is_on_gpu(offset_buffer, allow_copy) + offset_buffer = _ensure_gpu_buffer(offset_buffer, offset_dtype, allow_copy) offsets = build_column( - Buffer(data=offset_buffer.ptr, size=offset_buffer.bufsize, owner=None), + offset_buffer._buf, protocol_dtype_to_cupy_dtype(offset_dtype), ) - + offsets = offsets.astype("int32") cudfcol_str = build_column( None, dtype=cp.dtype("O"), children=(offsets, encoded_string) ) - return _set_missing_values(col, cudfcol_str), buffers + return _set_missing_values(col, cudfcol_str, allow_copy), buffers diff --git a/python/cudf/cudf/tests/test_df_protocol.py b/python/cudf/cudf/tests/test_df_protocol.py index 73308c9475b..59aac83f52f 100644 --- a/python/cudf/cudf/tests/test_df_protocol.py +++ b/python/cudf/cudf/tests/test_df_protocol.py @@ -85,7 +85,7 @@ def assert_column_equal(col: _CuDFColumn, cudfcol): if col.null_count == 0: assert col.describe_null == (0, None) else: - assert col.describe_null == (3, 0) + assert col.describe_null == (4, 0) def assert_dataframe_equal(dfo: DataFrameObject, df: cudf.DataFrame): @@ -202,7 +202,7 @@ def test_NA_categorical_dtype(): col = df.__dataframe__().get_column_by_name("B") assert col.dtype[0] == _DtypeKind.CATEGORICAL assert col.null_count == 2 - assert col.describe_null == (3, 0) + assert col.describe_null == (4, 0) assert col.num_chunks() == 1 assert col.describe_categorical == (False, True, {0: 1, 1: 2, 2: 5}) assert_from_dataframe_equals(df, allow_copy=False) @@ -223,7 +223,7 @@ def test_NA_string_dtype(): col = df.__dataframe__().get_column_by_name("B") assert col.dtype[0] == _DtypeKind.STRING assert col.null_count == 1 - assert col.describe_null == (3, 0) + assert col.describe_null == (4, 0) assert col.num_chunks() == 1 assert_from_dataframe_equals(df, allow_copy=False) assert_from_dataframe_equals(df, allow_copy=True) @@ -246,5 +246,4 @@ def test_NA_mixed_dtype(): ) def test_from_cpu_df(pandas_df): df = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]}) - with pytest.raises(NotImplementedError): - cudf.from_dataframe(df, allow_copy=True) + cudf.from_dataframe(df, allow_copy=True) From 98c80357022cf30f248ff21026ef0f4c47d1bf66 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Mon, 31 Oct 2022 13:29:48 -0400 Subject: [PATCH 16/26] Lots of fixes --- python/cudf/cudf/core/df_protocol.py | 52 +++++++++++----------- python/cudf/cudf/tests/test_df_protocol.py | 48 +++++++++++++++++--- 2 files changed, 67 insertions(+), 33 deletions(-) diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index 27a3831e3d6..5e3310826bb 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -18,7 +18,7 @@ from numba.cuda import as_cuda_array import cudf -from cudf.core.buffer import as_device_buffer_like, DeviceBufferLike +from cudf.core.buffer import DeviceBufferLike, as_device_buffer_like from cudf.core.column import as_column, build_categorical_column, build_column # Implementation of interchange protocol classes @@ -57,7 +57,6 @@ class _Device(enum.IntEnum): ProtoDtype = Tuple[_DtypeKind, int, str, str] - class _CuDFBuffer: """ Data in the buffer is guaranteed to be contiguous in memory. @@ -314,8 +313,8 @@ def describe_null(self) -> Tuple[int, Any]: return 0, None elif kind in _SUPPORTED_KINDS: - # currently, we return a byte mask - return 4, 0 + # currently, we return a bit mask + return 3, 0 else: raise NotImplementedError( @@ -399,22 +398,12 @@ def _get_validity_buffer( Raises RuntimeError if null representation is not a bit or byte mask. """ - null, invalid = self.describe_null - if null == 4: - if self.dtype[0] == _DtypeKind.CATEGORICAL: - valid_mask = cast( - cudf.core.column.CategoricalColumn, self._col - ).codes._get_mask_as_column() - else: - valid_mask = self._col._get_mask_as_column() - assert (valid_mask is not None) and ( - valid_mask.data is not None - ), "valid_mask(.data) should not be None when " - "_CuDFColumn.describe_null[0] = 3" + if null == 3: + assert self._col.mask is not None buffer = _CuDFBuffer( - valid_mask.data, cp.uint8, allow_copy=self._allow_copy + self._col.mask, cp.uint8, allow_copy=self._allow_copy ) dtype = (_DtypeKind.UINT, 8, "C", "=") return buffer, dtype @@ -645,7 +634,13 @@ def __dataframe__( _INTS = {8: cp.int8, 16: cp.int16, 32: cp.int32, 64: cp.int64} _UINTS = {8: cp.uint8, 16: cp.uint16, 32: cp.uint32, 64: cp.uint64} _FLOATS = {32: cp.float32, 64: cp.float64} -_CP_DTYPES = {0: _INTS, 1: _UINTS, 2: _FLOATS, 20: {8: bool}, 21: {8: cp.uint8}} +_CP_DTYPES = { + 0: _INTS, + 1: _UINTS, + 2: _FLOATS, + 20: {8: bool}, + 21: {8: cp.uint8}, +} def from_dataframe( @@ -705,7 +700,7 @@ def from_dataframe( def _protocol_to_cudf_column_numeric( - col: _CuDFColumn, allow_copy: bool + col, allow_copy: bool ) -> Tuple[ cudf.core.column.ColumnBase, Mapping[str, Optional[Tuple[_CuDFBuffer, ProtoDtype]]], @@ -742,26 +737,31 @@ def _ensure_gpu_buffer(buf, data_type, allow_copy: bool): return _CuDFBuffer( as_device_buffer_like(dbuf), protocol_dtype_to_cupy_dtype(data_type), - allow_copy + allow_copy, ) return buf def _set_missing_values( - protocol_col: _CuDFColumn, cudf_col: cudf.core.column.ColumnBase, allow_copy: bool + protocol_col, + cudf_col: cudf.core.column.ColumnBase, + allow_copy: bool, ) -> cudf.core.column.ColumnBase: valid_mask = protocol_col.get_buffers()["validity"] if valid_mask is not None: - breakpoint() null, invalid = protocol_col.describe_null if null == 4: # boolmask - valid_mask = _ensure_gpu_buffer(valid_mask[0], valid_mask[1], allow_copy) + valid_mask = _ensure_gpu_buffer( + valid_mask[0], valid_mask[1], allow_copy + ) boolmask = as_column(valid_mask._buf, dtype="bool") bitmask = cudf._lib.transform.bools_to_mask(boolmask) return cudf_col.set_mask(bitmask) elif null == 3: # bitmask: - valid_mask = _ensure_gpu_buffer(valid_mask[0], valid_mask[1], allow_copy) + valid_mask = _ensure_gpu_buffer( + valid_mask[0], valid_mask[1], allow_copy + ) bitmask = valid_mask._buf return cudf_col.set_mask(bitmask) return cudf_col @@ -777,7 +777,7 @@ def protocol_dtype_to_cupy_dtype(_dtype: ProtoDtype) -> cp.dtype: def _protocol_to_cudf_column_categorical( - col: _CuDFColumn, allow_copy: bool + col, allow_copy: bool ) -> Tuple[ cudf.core.column.ColumnBase, Mapping[str, Optional[Tuple[_CuDFBuffer, ProtoDtype]]], @@ -814,7 +814,7 @@ def _protocol_to_cudf_column_categorical( def _protocol_to_cudf_column_string( - col: _CuDFColumn, allow_copy: bool + col, allow_copy: bool ) -> Tuple[ cudf.core.column.ColumnBase, Mapping[str, Optional[Tuple[_CuDFBuffer, ProtoDtype]]], diff --git a/python/cudf/cudf/tests/test_df_protocol.py b/python/cudf/cudf/tests/test_df_protocol.py index 59aac83f52f..64169b6df27 100644 --- a/python/cudf/cudf/tests/test_df_protocol.py +++ b/python/cudf/cudf/tests/test_df_protocol.py @@ -6,10 +6,12 @@ import pandas as pd import pytest +import rmm + import cudf from cudf.core._compat import PANDAS_GE_150 -from cudf.core.buffer import Buffer -from cudf.core.column import build_column +from cudf.core.buffer import Buffer, as_device_buffer_like +from cudf.core.column import as_column, build_column from cudf.core.df_protocol import ( DataFrameObject, _CuDFBuffer, @@ -26,6 +28,36 @@ def pandas_df(): return pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]}) +def assert_validity_equal(protocol_buffer, cudf_buffer, size, null, valid): + if null == 3: + # boolmask + protocol_mask = as_device_buffer_like( + rmm.DeviceBuffer( + ptr=protocol_buffer[0].ptr, size=protocol_buffer[0].bufsize + ) + ) + assert_eq( + as_column(protocol_mask, dtype="bool"), + as_column(cudf_buffer, dtype="bool"), + ) + elif null == 4: + # bitmask + protocol_mask = as_device_buffer_like( + rmm.DeviceBuffer( + ptr=protocol_buffer[0].ptr, size=protocol_buffer[0].bufsize + ) + ) + cudf_mask = cudf_buffer + assert_eq( + build_column( + None, "string", size, mask=protocol_mask, children=() + ), + build_column(None, "string", size, mask=cudf_mask, children=()), + ) + else: + raise NotImplementedError() + + def assert_buffer_equal(buffer_and_dtype: Tuple[_CuDFBuffer, Any], cudfcol): buf, dtype = buffer_and_dtype device_id = cp.asarray(cudfcol.data).device.id @@ -65,9 +97,11 @@ def assert_column_equal(col: _CuDFColumn, cudfcol): pytest.raises(RuntimeError, col._get_validity_buffer) assert col.get_buffers()["validity"] is None else: - assert_buffer_equal( + assert_validity_equal( col.get_buffers()["validity"], - cudfcol._get_mask_as_column().astype(cp.uint8), + cudfcol.mask, + cudfcol.size, + *col.describe_null, ) if col.dtype[0] == _DtypeKind.CATEGORICAL: @@ -85,7 +119,7 @@ def assert_column_equal(col: _CuDFColumn, cudfcol): if col.null_count == 0: assert col.describe_null == (0, None) else: - assert col.describe_null == (4, 0) + assert col.describe_null == (3, 0) def assert_dataframe_equal(dfo: DataFrameObject, df: cudf.DataFrame): @@ -202,7 +236,7 @@ def test_NA_categorical_dtype(): col = df.__dataframe__().get_column_by_name("B") assert col.dtype[0] == _DtypeKind.CATEGORICAL assert col.null_count == 2 - assert col.describe_null == (4, 0) + assert col.describe_null == (3, 0) assert col.num_chunks() == 1 assert col.describe_categorical == (False, True, {0: 1, 1: 2, 2: 5}) assert_from_dataframe_equals(df, allow_copy=False) @@ -223,7 +257,7 @@ def test_NA_string_dtype(): col = df.__dataframe__().get_column_by_name("B") assert col.dtype[0] == _DtypeKind.STRING assert col.null_count == 1 - assert col.describe_null == (4, 0) + assert col.describe_null == (3, 0) assert col.num_chunks() == 1 assert_from_dataframe_equals(df, allow_copy=False) assert_from_dataframe_equals(df, allow_copy=True) From 9ccd44055001236b1e326342829835c26b174da5 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Mon, 31 Oct 2022 13:47:36 -0400 Subject: [PATCH 17/26] More improvements --- python/cudf/cudf/core/df_protocol.py | 33 +++++++++++++++----- python/cudf/cudf/tests/test_df_protocol.py | 35 ++++++++-------------- 2 files changed, 38 insertions(+), 30 deletions(-) diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index 5e3310826bb..a5b4c65e46c 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -17,6 +17,8 @@ import numpy as np from numba.cuda import as_cuda_array +import rmm + import cudf from cudf.core.buffer import DeviceBufferLike, as_device_buffer_like from cudf.core.column import as_column, build_categorical_column, build_column @@ -46,6 +48,14 @@ class _Device(enum.IntEnum): ROCM = 10 +class _MaskKind(enum.IntEnum): + NON_NULLABLE = (0,) + NAN = (1,) + SENTINEL = (2,) + BITMASK = (3,) + BYTEMASK = 4 + + _SUPPORTED_KINDS = { _DtypeKind.INT, _DtypeKind.UINT, @@ -400,7 +410,7 @@ def _get_validity_buffer( """ null, invalid = self.describe_null - if null == 3: + if null == _MaskKind.BITMASK: assert self._col.mask is not None buffer = _CuDFBuffer( self._col.mask, cp.uint8, allow_copy=self._allow_copy @@ -408,12 +418,12 @@ def _get_validity_buffer( dtype = (_DtypeKind.UINT, 8, "C", "=") return buffer, dtype - elif null == 1: + elif null == _MaskKind.NAN: raise RuntimeError( "This column uses NaN as null " "so does not have a separate mask" ) - elif null == 0: + elif null == _MaskKind.NON_NULLABLE: raise RuntimeError( "This column is non-nullable so does not have a mask" ) @@ -723,9 +733,10 @@ def _protocol_to_cudf_column_numeric( return _set_missing_values(col, cudfcol_num, allow_copy), buffers -def _ensure_gpu_buffer(buf, data_type, allow_copy: bool): - import rmm - +def _ensure_gpu_buffer(buf, data_type, allow_copy: bool) -> _CuDFBuffer: + # if `buf` is a (protocol) buffer that lives on the GPU already, + # return it as is. Otherwise, copy it to the device and return + # the resulting buffer. if buf.__dlpack_device__()[0] != _Device.CUDA: if not allow_copy: raise TypeError( @@ -751,14 +762,14 @@ def _set_missing_values( valid_mask = protocol_col.get_buffers()["validity"] if valid_mask is not None: null, invalid = protocol_col.describe_null - if null == 4: # boolmask + if null == _MaskKind.BYTEMASK: valid_mask = _ensure_gpu_buffer( valid_mask[0], valid_mask[1], allow_copy ) boolmask = as_column(valid_mask._buf, dtype="bool") bitmask = cudf._lib.transform.bools_to_mask(boolmask) return cudf_col.set_mask(bitmask) - elif null == 3: # bitmask: + elif null == _MaskKind.BITMASK: valid_mask = _ensure_gpu_buffer( valid_mask[0], valid_mask[1], allow_copy ) @@ -848,3 +859,9 @@ def _protocol_to_cudf_column_string( None, dtype=cp.dtype("O"), children=(offsets, encoded_string) ) return _set_missing_values(col, cudfcol_str, allow_copy), buffers + + +def _protocol_buffer_to_cudf_buffer(protocol_buffer): + return as_device_buffer_like( + rmm.DeviceBuffer(ptr=protocol_buffer.ptr, size=protocol_buffer.bufsize) + ) diff --git a/python/cudf/cudf/tests/test_df_protocol.py b/python/cudf/cudf/tests/test_df_protocol.py index 64169b6df27..0bcb6dde271 100644 --- a/python/cudf/cudf/tests/test_df_protocol.py +++ b/python/cudf/cudf/tests/test_df_protocol.py @@ -6,17 +6,16 @@ import pandas as pd import pytest -import rmm - import cudf from cudf.core._compat import PANDAS_GE_150 -from cudf.core.buffer import Buffer, as_device_buffer_like from cudf.core.column import as_column, build_column from cudf.core.df_protocol import ( DataFrameObject, _CuDFBuffer, _CuDFColumn, _DtypeKind, + _MaskKind, + _protocol_buffer_to_cudf_buffer, from_dataframe, protocol_dtype_to_cupy_dtype, ) @@ -29,30 +28,22 @@ def pandas_df(): def assert_validity_equal(protocol_buffer, cudf_buffer, size, null, valid): - if null == 3: - # boolmask - protocol_mask = as_device_buffer_like( - rmm.DeviceBuffer( - ptr=protocol_buffer[0].ptr, size=protocol_buffer[0].bufsize - ) - ) + if null == _MaskKind.BYTEMASK: + protocol_mask = _protocol_buffer_to_cudf_buffer(protocol_buffer) assert_eq( as_column(protocol_mask, dtype="bool"), as_column(cudf_buffer, dtype="bool"), ) - elif null == 4: - # bitmask - protocol_mask = as_device_buffer_like( - rmm.DeviceBuffer( - ptr=protocol_buffer[0].ptr, size=protocol_buffer[0].bufsize - ) - ) + elif null == _MaskKind.BITMASK: + protocol_mask = _protocol_buffer_to_cudf_buffer(protocol_buffer) cudf_mask = cudf_buffer assert_eq( build_column( - None, "string", size, mask=protocol_mask, children=() + None, "string", size=size, mask=protocol_mask, children=() + ), + build_column( + None, "string", size=size, mask=cudf_mask, children=() ), - build_column(None, "string", size, mask=cudf_mask, children=()), ) else: raise NotImplementedError() @@ -63,7 +54,7 @@ def assert_buffer_equal(buffer_and_dtype: Tuple[_CuDFBuffer, Any], cudfcol): device_id = cp.asarray(cudfcol.data).device.id assert buf.__dlpack_device__() == (2, device_id) col_from_buf = build_column( - Buffer(data=buf.ptr, size=buf.bufsize, owner=None), + _protocol_buffer_to_cudf_buffer(buf), protocol_dtype_to_cupy_dtype(dtype), ) # check that non null values are the equals as nulls are represented @@ -78,7 +69,7 @@ def assert_buffer_equal(buffer_and_dtype: Tuple[_CuDFBuffer, Any], cudfcol): ) if dtype[0] != _DtypeKind.BOOL: - array_from_dlpack = cp.fromDlpack(buf.__dlpack__()).get() + array_from_dlpack = cp.from_dlpack(buf.__dlpack__()).get() col_array = cp.asarray(cudfcol.data_array_view).get() assert_eq( array_from_dlpack[non_null_idxs.to_numpy()].flatten(), @@ -98,7 +89,7 @@ def assert_column_equal(col: _CuDFColumn, cudfcol): assert col.get_buffers()["validity"] is None else: assert_validity_equal( - col.get_buffers()["validity"], + col.get_buffers()["validity"][0], cudfcol.mask, cudfcol.size, *col.describe_null, From f1cb5cbf5ad081e67b9631d21210157a7a2eed13 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Mon, 31 Oct 2022 13:48:44 -0400 Subject: [PATCH 18/26] Make the Pandas DF nullable --- python/cudf/cudf/tests/test_df_protocol.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/tests/test_df_protocol.py b/python/cudf/cudf/tests/test_df_protocol.py index 0bcb6dde271..4a81b1bccba 100644 --- a/python/cudf/cudf/tests/test_df_protocol.py +++ b/python/cudf/cudf/tests/test_df_protocol.py @@ -24,7 +24,7 @@ @pytest.fixture def pandas_df(): - return pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]}) + return pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", None]}) def assert_validity_equal(protocol_buffer, cudf_buffer, size, null, valid): From 2a5859da78c57ab17b0195ff77b63e2755a61e5f Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Mon, 31 Oct 2022 14:41:47 -0400 Subject: [PATCH 19/26] use enum --- python/cudf/cudf/core/df_protocol.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index a5b4c65e46c..96b23d97edc 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -324,7 +324,7 @@ def describe_null(self) -> Tuple[int, Any]: elif kind in _SUPPORTED_KINDS: # currently, we return a bit mask - return 3, 0 + return _MaskKind.BITMASK, 0 else: raise NotImplementedError( From 1ec350de5ac9b37a8654f35f1c51a23b680834b6 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Mon, 31 Oct 2022 14:42:17 -0400 Subject: [PATCH 20/26] Int -> enum --- python/cudf/cudf/core/df_protocol.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index 96b23d97edc..785ffb62fae 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -320,7 +320,7 @@ def describe_null(self) -> Tuple[int, Any]: kind = self.dtype[0] if self.null_count == 0: # there is no validity mask so it is non-nullable - return 0, None + return _MaskKind.NON_NULLABLE, None elif kind in _SUPPORTED_KINDS: # currently, we return a bit mask From e4d1d4f3a5ea16fa37d099fd79e2707422857d54 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 5 Apr 2023 14:41:07 -0400 Subject: [PATCH 21/26] Add docstring for from_dataframe --- python/cudf/cudf/core/df_protocol.py | 34 +++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index c7329785fb8..bcf74e858ad 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -657,7 +657,39 @@ def from_dataframe( df: DataFrameObject, allow_copy: bool = False ) -> _CuDFDataFrame: """ - Construct a cudf DataFrame from ``df`` if it supports ``__dataframe__`` + Construct a ``DataFrame`` from ``df`` if it supports the + dataframe interchange protocol (``__dataframe__``). + + Parameters + ---------- + df: Object supporting dataframe interchange protocol + allow_copy + If ``True``, allow copying of the data. If ``False``, a + ``TypeError`` is raised if data copying is required to + construct the ``DataFrame`` (e.g., if ``df`` lives in CPU + memory). + + Returns + ------- + DataFrame + + Examples + -------- + >>> import pandas as pd + >>> pdf = pd.DataFrame({'a': [1, 2, 3], 'b': ['x', 'y', 'z']}) + >>> df = cudf.from_dataframe(pdf, allow_copy=True) + >>> type(df) + cudf.core.dataframe.DataFrame + >>> df + a b + 0 1 x + 1 2 y + 2 3 z + + Notes + ----- + See https://data-apis.org/dataframe-protocol/latest/index.html + for the dataframe interchange protocol spec and API """ if isinstance(df, cudf.DataFrame): return df From 2525790cd1db7c9a478c66bdc2c2f715d3bd8069 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 5 Apr 2023 14:42:53 -0400 Subject: [PATCH 22/26] Use ints to construct IntEnum --- python/cudf/cudf/core/df_protocol.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index bcf74e858ad..fd0d0214a9b 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -49,10 +49,10 @@ class _Device(enum.IntEnum): class _MaskKind(enum.IntEnum): - NON_NULLABLE = (0,) - NAN = (1,) - SENTINEL = (2,) - BITMASK = (3,) + NON_NULLABLE = 0 + NAN = 1 + SENTINEL = 2 + BITMASK = 3 BYTEMASK = 4 From e0f3275825c7e8b441d07535c350be07a2db6df0 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 5 Apr 2023 14:53:19 -0400 Subject: [PATCH 23/26] Review suggestions --- python/cudf/cudf/core/df_protocol.py | 12 +++++----- python/cudf/cudf/tests/test_df_protocol.py | 28 ++++++++++++---------- 2 files changed, 22 insertions(+), 18 deletions(-) diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index fd0d0214a9b..5c69966a429 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -770,18 +770,18 @@ def _ensure_gpu_buffer(buf, data_type, allow_copy: bool) -> _CuDFBuffer: # return it as is. Otherwise, copy it to the device and return # the resulting buffer. if buf.__dlpack_device__()[0] != _Device.CUDA: - if not allow_copy: - raise TypeError( - "This operation must copy data from CPU to GPU. " - "Set `allow_copy=True` to allow it." - ) - else: + if allow_copy: dbuf = rmm.DeviceBuffer(ptr=buf.ptr, size=buf.bufsize) return _CuDFBuffer( as_buffer(dbuf, exposed=True), protocol_dtype_to_cupy_dtype(data_type), allow_copy, ) + else: + raise TypeError( + "This operation must copy data from CPU to GPU. " + "Set `allow_copy=True` to allow it." + ) return buf diff --git a/python/cudf/cudf/tests/test_df_protocol.py b/python/cudf/cudf/tests/test_df_protocol.py index ad4b97b0836..fc5182d3c83 100644 --- a/python/cudf/cudf/tests/test_df_protocol.py +++ b/python/cudf/cudf/tests/test_df_protocol.py @@ -40,10 +40,18 @@ def assert_validity_equal(protocol_buffer, cudf_buffer, size, null, valid): cudf_mask = cudf_buffer assert_eq( build_column( - None, "string", size=size, mask=protocol_mask, children=() + as_buffer(cp.zeros(10, dtype="int8")), + "int8", + size=size, + mask=protocol_mask, + children=(), ), build_column( - None, "string", size=size, mask=cudf_mask, children=() + as_buffer(cp.zeros(10, dtype="int8")), + "int8", + size=size, + mask=cudf_mask, + children=(), ), ) else: @@ -68,16 +76,12 @@ def assert_buffer_equal(buffer_and_dtype: Tuple[_CuDFBuffer, Any], cudfcol): col_from_buf.apply_boolean_mask(non_null_idxs), cudfcol.apply_boolean_mask(non_null_idxs), ) - - if dtype[0] != _DtypeKind.BOOL: - array_from_dlpack = cp.from_dlpack(buf.__dlpack__()).get() - col_array = cp.asarray(cudfcol.data_array_view(mode="read")).get() - assert_eq( - array_from_dlpack[non_null_idxs.to_numpy()].flatten(), - col_array[non_null_idxs.to_numpy()].flatten(), - ) - else: - pytest.raises(TypeError, buf.__dlpack__) + array_from_dlpack = cp.from_dlpack(buf.__dlpack__()).get() + col_array = cp.asarray(cudfcol.data_array_view(mode="read")).get() + assert_eq( + array_from_dlpack[non_null_idxs.to_numpy()].flatten(), + col_array[non_null_idxs.to_numpy()].flatten(), + ) def assert_column_equal(col: _CuDFColumn, cudfcol): From f48422f4a65842d899338eb1419be1fb82e4d625 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Fri, 14 Apr 2023 10:22:29 -0400 Subject: [PATCH 24/26] Fix categorical handling and add tests for nulls --- python/cudf/cudf/core/df_protocol.py | 4 +--- python/cudf/cudf/tests/test_df_protocol.py | 16 +++++++++++----- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index 5c69966a429..2c542f59f17 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -828,13 +828,11 @@ def _protocol_to_cudf_column_categorical( """ Convert a categorical column to a Series instance """ - ordered, is_dict, mapping = col.describe_categorical + ordered, is_dict, categories = col.describe_categorical if not is_dict: raise NotImplementedError( "Non-dictionary categoricals not supported yet" ) - - categories = as_column(mapping.values()) buffers = col.get_buffers() assert buffers["data"] is not None, "data buffer should not be None" codes_buffer, codes_dtype = buffers["data"] diff --git a/python/cudf/cudf/tests/test_df_protocol.py b/python/cudf/cudf/tests/test_df_protocol.py index fc5182d3c83..fd550635b61 100644 --- a/python/cudf/cudf/tests/test_df_protocol.py +++ b/python/cudf/cudf/tests/test_df_protocol.py @@ -23,9 +23,16 @@ from cudf.testing._utils import assert_eq -@pytest.fixture -def pandas_df(): - return pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", None]}) +@pytest.fixture( + params=[ + {"a": [1, 2, 3], "b": ["x", "y", "z"]}, + {"a": [1, 2, None], "b": ["x", "y", "z"]}, + {"a": [1, 2, 3], "b": pd.Categorical(["x", "y", None])}, + ] +) +def pandas_df(request): + data = request.param + return pd.DataFrame(data) def assert_validity_equal(protocol_buffer, cudf_buffer, size, null, valid): @@ -275,5 +282,4 @@ def test_NA_mixed_dtype(): reason="Pandas versions < 1.5.0 do not support interchange protocol", ) def test_from_cpu_df(pandas_df): - df = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]}) - cudf.from_dataframe(df, allow_copy=True) + cudf.from_dataframe(pandas_df, allow_copy=True) From e90fe68f990eb6e059afe7a809b53f11e6b60a4c Mon Sep 17 00:00:00 2001 From: Ashwin Srinath <3190405+shwina@users.noreply.github.com> Date: Fri, 14 Apr 2023 14:46:05 -0400 Subject: [PATCH 25/26] Update python/cudf/cudf/core/df_protocol.py Co-authored-by: Bradley Dice --- python/cudf/cudf/core/df_protocol.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index 2c542f59f17..a0663116c04 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -662,8 +662,9 @@ def from_dataframe( Parameters ---------- - df: Object supporting dataframe interchange protocol - allow_copy + df : DataFrameObject + Object supporting dataframe interchange protocol + allow_copy : bool If ``True``, allow copying of the data. If ``False``, a ``TypeError`` is raised if data copying is required to construct the ``DataFrame`` (e.g., if ``df`` lives in CPU From 862f587262db218af52dda697f3e2176830c0e7a Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Fri, 28 Apr 2023 06:56:10 -0400 Subject: [PATCH 26/26] size is a method --- python/cudf/cudf/core/df_protocol.py | 1 - python/cudf/cudf/tests/test_df_protocol.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index a0663116c04..6e1c5f6fd00 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -160,7 +160,6 @@ def __init__( self._nan_as_null = nan_as_null self._allow_copy = allow_copy - @property def size(self) -> int: """ Size of the column, in elements. diff --git a/python/cudf/cudf/tests/test_df_protocol.py b/python/cudf/cudf/tests/test_df_protocol.py index fd550635b61..d6134c7bb01 100644 --- a/python/cudf/cudf/tests/test_df_protocol.py +++ b/python/cudf/cudf/tests/test_df_protocol.py @@ -92,7 +92,7 @@ def assert_buffer_equal(buffer_and_dtype: Tuple[_CuDFBuffer, Any], cudfcol): def assert_column_equal(col: _CuDFColumn, cudfcol): - assert col.size == cudfcol.size + assert col.size() == cudfcol.size assert col.offset == 0 assert col.null_count == cudfcol.null_count assert col.num_chunks() == 1