Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix consumption of CPU-backed interchange protocol dataframes #11392

Merged
Show file tree
Hide file tree
Changes from 26 commits
Commits
Show all changes
47 commits
Select commit Hold shift + click to select a range
ca971da
Fix consumption of non-GPU backed protocol dataframes
shwina Jul 28, 2022
a39281b
test pandas 1.5 rc0
galipremsagar Aug 29, 2022
f87f232
temp commit
galipremsagar Aug 29, 2022
fc1647a
Merge remote-tracking branch 'upstream/branch-22.10' into pandas_1.5.x
galipremsagar Aug 29, 2022
b7b3d76
initial pass of fixes
galipremsagar Aug 30, 2022
41d0381
Merge remote-tracking branch 'upstream/branch-22.10' into pandas_1.5.x
galipremsagar Aug 30, 2022
1b92423
Merge remote-tracking branch 'upstream/branch-22.10' into pandas_1.5.x
galipremsagar Aug 31, 2022
7095878
fix
galipremsagar Aug 31, 2022
01bd01e
Merge remote-tracking branch 'upstream/branch-22.10' into pandas_1.5.x
galipremsagar Aug 31, 2022
28d12db
more fixes
galipremsagar Aug 31, 2022
47eea3c
more fixes
galipremsagar Aug 31, 2022
8d53832
more fixes
galipremsagar Sep 1, 2022
a558088
Merge remote-tracking branch 'upstream/branch-22.10' into pandas_1.5.x
galipremsagar Sep 1, 2022
5fae833
more fixes
galipremsagar Sep 1, 2022
988443d
Merge remote-tracking branch 'upstream/branch-22.10' into pandas_1.5.x
galipremsagar Sep 1, 2022
d8d545e
fix
galipremsagar Sep 1, 2022
817bcf1
merge
galipremsagar Sep 2, 2022
ff61af7
update
galipremsagar Sep 2, 2022
092d20c
Merge remote-tracking branch 'upstream/branch-22.10' into pandas_1.5.x
galipremsagar Sep 2, 2022
287756a
fix
galipremsagar Sep 2, 2022
f95fd4b
Merge remote-tracking branch 'upstream/branch-22.10' into pandas_1.5.x
galipremsagar Sep 6, 2022
c05790c
Merge branch 'branch-22.10' of https://github.com/rapidsai/cudf into …
shwina Sep 6, 2022
f92d742
Merge remote-tracking branch 'galipremsagar/pandas_1.5.x' into fix-in…
shwina Sep 6, 2022
c3a83c2
Merge branch 'branch-22.10' of https://github.com/rapidsai/cudf into …
shwina Sep 21, 2022
835d439
LT -> GE
shwina Sep 21, 2022
cb9c059
Undo changes introduced by upstream PR
shwina Sep 22, 2022
9602fba
Merge branch 'branch-22.10' of https://github.com/rapidsai/cudf into …
shwina Oct 11, 2022
1290afc
Merge branch 'branch-22.12' of https://github.com/rapidsai/cudf into …
shwina Oct 11, 2022
022991d
Merge branch 'branch-22.12' of github.com:rapidsai/cudf into fix-inte…
shwina Oct 28, 2022
4fd41e2
Add true support for CPU backed DFs
shwina Oct 31, 2022
98c8035
Lots of fixes
shwina Oct 31, 2022
9ccd440
More improvements
shwina Oct 31, 2022
f1cb5cb
Make the Pandas DF nullable
shwina Oct 31, 2022
2a5859d
use enum
shwina Oct 31, 2022
1ec350d
Int -> enum
shwina Oct 31, 2022
ecd91d2
Merge branch 'branch-23.02' into fix-interchange-protocol-error-cross…
shwina Dec 2, 2022
48c7e7d
Merge branch 'branch-23.06' of github.com:rapidsai/cudf into fix-inte…
shwina Apr 5, 2023
e4d1d4f
Add docstring for from_dataframe
shwina Apr 5, 2023
2525790
Use ints to construct IntEnum
shwina Apr 5, 2023
e0f3275
Review suggestions
shwina Apr 5, 2023
99e7c65
Merge branch 'branch-23.06' into fix-interchange-protocol-error-cross…
vyasr Apr 13, 2023
0837055
Merge branch 'branch-23.06' of github.com:rapidsai/cudf into fix-inte…
shwina Apr 14, 2023
f48422f
Fix categorical handling and add tests for nulls
shwina Apr 14, 2023
138b4d2
Merge branch 'fix-interchange-protocol-error-cross-device' of github.…
shwina Apr 14, 2023
e90fe68
Update python/cudf/cudf/core/df_protocol.py
shwina Apr 14, 2023
0ac28ed
Merge branch 'branch-23.06' into fix-interchange-protocol-error-cross…
shwina Apr 28, 2023
862f587
size is a method
shwina Apr 28, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 20 additions & 22 deletions python/cudf/cudf/core/df_protocol.py
Original file line number Diff line number Diff line change
Expand Up @@ -660,13 +660,8 @@ def from_dataframe(
if not hasattr(df, "__dataframe__"):
raise ValueError("`df` does not support __dataframe__")

return _from_dataframe(df.__dataframe__(allow_copy=allow_copy))
df = df.__dataframe__(allow_copy=allow_copy)
shwina marked this conversation as resolved.
Show resolved Hide resolved


def _from_dataframe(df: DataFrameObject) -> _CuDFDataFrame:
"""
Create a cudf DataFrame object from DataFrameObject.
"""
# Check number of chunks, if there's more than one we need to iterate
if df.num_chunks() > 1:
raise NotImplementedError("More than one chunk not handled yet")
Expand All @@ -683,13 +678,19 @@ def _from_dataframe(df: DataFrameObject) -> _CuDFDataFrame:
_DtypeKind.FLOAT,
_DtypeKind.BOOL,
):
columns[name], _buf = _protocol_to_cudf_column_numeric(col)
columns[name], _buf = _protocol_to_cudf_column_numeric(
col, allow_copy
)

elif col.dtype[0] == _DtypeKind.CATEGORICAL:
columns[name], _buf = _protocol_to_cudf_column_categorical(col)
columns[name], _buf = _protocol_to_cudf_column_categorical(
col, allow_copy
)

elif col.dtype[0] == _DtypeKind.STRING:
columns[name], _buf = _protocol_to_cudf_column_string(col)
columns[name], _buf = _protocol_to_cudf_column_string(
col, allow_copy
)

else:
raise NotImplementedError(
Expand All @@ -704,7 +705,7 @@ def _from_dataframe(df: DataFrameObject) -> _CuDFDataFrame:


def _protocol_to_cudf_column_numeric(
col: _CuDFColumn,
col: _CuDFColumn, allow_copy: bool
) -> Tuple[
cudf.core.column.ColumnBase,
Mapping[str, Optional[Tuple[_CuDFBuffer, ProtoDtype]]],
Expand All @@ -719,25 +720,22 @@ def _protocol_to_cudf_column_numeric(
buffers = col.get_buffers()
assert buffers["data"] is not None, "data buffer should not be None"
_dbuffer, _ddtype = buffers["data"]
_check_buffer_is_on_gpu(_dbuffer)
_check_buffer_is_on_gpu(_dbuffer, allow_copy)
cudfcol_num = build_column(
Buffer(data=_dbuffer.ptr, size=_dbuffer.bufsize, owner=None),
protocol_dtype_to_cupy_dtype(_ddtype),
)
return _set_missing_values(col, cudfcol_num), buffers


def _check_buffer_is_on_gpu(buffer: _CuDFBuffer) -> None:
if (
buffer.__dlpack_device__()[0] != _Device.CUDA
and not buffer._allow_copy
):
def _check_buffer_is_on_gpu(buffer: _CuDFBuffer, allow_copy: bool) -> None:
if buffer.__dlpack_device__()[0] != _Device.CUDA and not allow_copy:
raise TypeError(
"This operation must copy data from CPU to GPU. "
"Set `allow_copy=True` to allow it."
)

elif buffer.__dlpack_device__()[0] != _Device.CUDA and buffer._allow_copy:
elif buffer.__dlpack_device__()[0] != _Device.CUDA and allow_copy:
raise NotImplementedError(
"Only cuDF/GPU dataframes are supported for now. "
"CPU (like `Pandas`) dataframes will be supported shortly."
Expand Down Expand Up @@ -771,7 +769,7 @@ def protocol_dtype_to_cupy_dtype(_dtype: ProtoDtype) -> cp.dtype:


def _protocol_to_cudf_column_categorical(
col: _CuDFColumn,
col: _CuDFColumn, allow_copy: bool
) -> Tuple[
cudf.core.column.ColumnBase,
Mapping[str, Optional[Tuple[_CuDFBuffer, ProtoDtype]]],
Expand All @@ -789,7 +787,7 @@ def _protocol_to_cudf_column_categorical(
buffers = col.get_buffers()
assert buffers["data"] is not None, "data buffer should not be None"
codes_buffer, codes_dtype = buffers["data"]
_check_buffer_is_on_gpu(codes_buffer)
_check_buffer_is_on_gpu(codes_buffer, allow_copy)
cdtype = protocol_dtype_to_cupy_dtype(codes_dtype)
codes = build_column(
Buffer(data=codes_buffer.ptr, size=codes_buffer.bufsize, owner=None),
Expand All @@ -808,7 +806,7 @@ def _protocol_to_cudf_column_categorical(


def _protocol_to_cudf_column_string(
col: _CuDFColumn,
col: _CuDFColumn, allow_copy: bool
) -> Tuple[
cudf.core.column.ColumnBase,
Mapping[str, Optional[Tuple[_CuDFBuffer, ProtoDtype]]],
Expand All @@ -822,7 +820,7 @@ def _protocol_to_cudf_column_string(
# Retrieve the data buffer containing the UTF-8 code units
assert buffers["data"] is not None, "data buffer should never be None"
data_buffer, data_dtype = buffers["data"]
_check_buffer_is_on_gpu(data_buffer)
_check_buffer_is_on_gpu(data_buffer, allow_copy)
encoded_string = build_column(
Buffer(data=data_buffer.ptr, size=data_buffer.bufsize, owner=None),
protocol_dtype_to_cupy_dtype(data_dtype),
Expand All @@ -832,7 +830,7 @@ def _protocol_to_cudf_column_string(
# the beginning and end of each string
assert buffers["offsets"] is not None, "not possible for string column"
offset_buffer, offset_dtype = buffers["offsets"]
_check_buffer_is_on_gpu(offset_buffer)
_check_buffer_is_on_gpu(offset_buffer, allow_copy)
offsets = build_column(
Buffer(data=offset_buffer.ptr, size=offset_buffer.bufsize, owner=None),
protocol_dtype_to_cupy_dtype(offset_dtype),
Expand Down
1 change: 0 additions & 1 deletion python/cudf/cudf/core/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -602,7 +602,6 @@ def from_arrow(cls, typ):
return IntervalDtype(typ.subtype.to_pandas_dtype(), typ.closed)

def to_arrow(self):
    """Return the ``ArrowIntervalType`` for this dtype.

    The numpy ``subtype`` is converted to its pyarrow type first, then
    combined with ``closed``.
    """
    arrow_subtype = pa.from_numpy_dtype(self.subtype)
    return ArrowIntervalType(arrow_subtype, self.closed)
Expand Down
54 changes: 35 additions & 19 deletions python/cudf/cudf/tests/test_df_protocol.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,19 +7,25 @@
import pytest

import cudf
from cudf.core._compat import PANDAS_GE_150
from cudf.core.buffer import Buffer
from cudf.core.column import build_column
from cudf.core.df_protocol import (
DataFrameObject,
_CuDFBuffer,
_CuDFColumn,
_DtypeKind,
_from_dataframe,
from_dataframe,
protocol_dtype_to_cupy_dtype,
)
from cudf.testing._utils import assert_eq


@pytest.fixture
def pandas_df():
    """Provide a small pandas DataFrame: integer column "a", string column "b"."""
    data = {"a": [1, 2, 3], "b": ["x", "y", "z"]}
    return pd.DataFrame(data)


def assert_buffer_equal(buffer_and_dtype: Tuple[_CuDFBuffer, Any], cudfcol):
buf, dtype = buffer_and_dtype
device_id = cp.asarray(cudfcol.data).device.id
Expand Down Expand Up @@ -91,31 +97,31 @@ def assert_dataframe_equal(dfo: DataFrameObject, df: cudf.DataFrame):
assert_column_equal(dfo.get_column_by_name(col), df[col]._column)


def assert_from_dataframe_equals(dfobj, allow_copy):
    """Round-trip ``dfobj`` through ``from_dataframe`` and check the result.

    The result is compared against the interchange object produced by
    ``dfobj.__dataframe__(allow_copy)`` and then against ``dfobj`` itself
    (converted to a cudf DataFrame first when it is a pandas DataFrame).

    Raises
    ------
    TypeError
        If ``dfobj`` is neither a cudf nor a pandas DataFrame.
    """
    result = from_dataframe(dfobj, allow_copy=allow_copy)
    assert_dataframe_equal(dfobj.__dataframe__(allow_copy), result)

    # Pick the frame the round-tripped result must equal.
    if isinstance(dfobj, cudf.DataFrame):
        expected = dfobj
    elif isinstance(dfobj, pd.DataFrame):
        expected = cudf.DataFrame(dfobj)
    else:
        raise TypeError(f"{type(dfobj)} not supported yet.")

    assert_eq(expected, result)


@pytest.mark.skipif(
    not PANDAS_GE_150,
    reason="Pandas versions < 1.5.0 do not support interchange protocol",
)
def test_from_dataframe_exception(pandas_df):
    """Converting a pandas DataFrame without permission to copy raises.

    ``from_dataframe`` is called without ``allow_copy``; the test expects
    a ``TypeError`` telling the user to set ``allow_copy=True``.
    NOTE(review): this relies on the default of ``allow_copy`` disallowing
    copies -- the signature is not visible here, confirm.
    """
    # The two literals must be parenthesised to concatenate. As separate
    # statements the second one was a no-op expression, so ``match`` only
    # ever checked the first sentence of the message.
    exception_msg = (
        "This operation must copy data from CPU to GPU."
        " Set `allow_copy=True` to allow it."
    )
    with pytest.raises(TypeError, match=exception_msg):
        from_dataframe(pandas_df)


def assert_df_unique_dtype_cols(data):
    """Build a cudf DataFrame from ``data`` and round-trip it both ways.

    The round-trip check runs first with ``allow_copy=False`` and then
    with ``allow_copy=True``.
    """
    frame = cudf.DataFrame(data=data)
    for copy_flag in (False, True):
        assert_from_dataframe_equals(frame, allow_copy=copy_flag)


def test_from_dataframe():
Expand Down Expand Up @@ -144,8 +150,8 @@ def test_categorical_dtype():
col = cdf.__dataframe__().get_column_by_name("A")
assert col.dtype[0] == _DtypeKind.CATEGORICAL
assert col.describe_categorical == (False, True, {0: 1, 1: 2, 2: 5})
assert_from_dataframe_equals(cdf.__dataframe__(allow_copy=False))
assert_from_dataframe_equals(cdf.__dataframe__(allow_copy=True))
assert_from_dataframe_equals(cdf, allow_copy=False)
assert_from_dataframe_equals(cdf, allow_copy=True)


def test_bool_dtype():
Expand Down Expand Up @@ -199,8 +205,8 @@ def test_NA_categorical_dtype():
assert col.describe_null == (3, 0)
assert col.num_chunks() == 1
assert col.describe_categorical == (False, True, {0: 1, 1: 2, 2: 5})
assert_from_dataframe_equals(df.__dataframe__(allow_copy=False))
assert_from_dataframe_equals(df.__dataframe__(allow_copy=True))
assert_from_dataframe_equals(df, allow_copy=False)
assert_from_dataframe_equals(df, allow_copy=True)


def test_NA_bool_dtype():
Expand All @@ -219,8 +225,8 @@ def test_NA_string_dtype():
assert col.null_count == 1
assert col.describe_null == (3, 0)
assert col.num_chunks() == 1
assert_from_dataframe_equals(df.__dataframe__(allow_copy=False))
assert_from_dataframe_equals(df.__dataframe__(allow_copy=True))
assert_from_dataframe_equals(df, allow_copy=False)
assert_from_dataframe_equals(df, allow_copy=True)


def test_NA_mixed_dtype():
Expand All @@ -232,3 +238,13 @@ def test_NA_mixed_dtype():
string=[None, None, None, "df protocol", None],
)
assert_df_unique_dtype_cols(data_mixed)


@pytest.mark.skipif(
    not PANDAS_GE_150,
    reason="Pandas versions < 1.5.0 do not support interchange protocol",
)
def test_from_cpu_df(pandas_df):
    """Converting a pandas DataFrame with copies allowed is not implemented.

    With ``allow_copy=True`` the call is expected to raise
    ``NotImplementedError`` rather than copy the data.
    """
    # Use the shared fixture instead of rebuilding an identical frame
    # inline; the fixture argument was previously requested but unused.
    with pytest.raises(NotImplementedError):
        cudf.from_dataframe(pandas_df, allow_copy=True)
shwina marked this conversation as resolved.
Show resolved Hide resolved