From 8fc7e17b0b6356dfb70759bd19c19ca3a7bf68a8 Mon Sep 17 00:00:00 2001 From: iskode Date: Thu, 19 Aug 2021 11:30:10 +0000 Subject: [PATCH 01/60] dataframe protocol implementation, support only int, float, categorical without missing values --- python/cudf/cudf/core/dataframe.py | 5 + python/cudf/cudf/core/df_protocol.py | 672 +++++++++++++++++++++ python/cudf/cudf/tests/test_df_protocol.py | 65 ++ 3 files changed, 742 insertions(+) create mode 100644 python/cudf/cudf/core/df_protocol.py create mode 100644 python/cudf/cudf/tests/test_df_protocol.py diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 0aafae0a85b..4388ad20c53 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -7683,3 +7683,8 @@ def _drop_columns(df: DataFrame, columns: Iterable, errors: str): pass else: raise e + +from cudf.core.df_protocol import __dataframe__, from_dataframe + +DataFrame.__dataframe__ = __dataframe__ +DataFrame.from_dataframe = from_dataframe diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py new file mode 100644 index 00000000000..d252c881be0 --- /dev/null +++ b/python/cudf/cudf/core/df_protocol.py @@ -0,0 +1,672 @@ +""" +Implementation of the dataframe exchange protocol. + +Public API +---------- + +from_dataframe : construct a pandas.DataFrame from an input data frame which + implements the exchange protocol + +Notes +----- + +- Interpreting a raw pointer (as in ``Buffer.ptr``) is annoying and unsafe to + do in pure Python. It's more general but definitely less friendly than having + ``to_arrow`` and ``to_numpy`` methods. So for the buffers which lack + ``__dlpack__`` (e.g., because the column dtype isn't supported by DLPack), + this is worth looking at again. + +""" + +import enum +import collections +import ctypes +from typing import Any, Optional, Tuple, Dict, Iterable, Sequence + +import cudf +import numpy as np +import cupy as cp +import pandas._testing as tm +import cudf.testing as testcase +import pytest + + +# A typing protocol could be added later to let Mypy validate code using +# `from_dataframe` better. +DataFrameObject = Any +ColumnObject = Any + + +def from_dataframe(df : DataFrameObject, copy: bool = False) : + """ + Construct a cudf DataFrame from ``df`` if it supports ``__dataframe__`` + """ + if isinstance(df, cudf.DataFrame): + return df + + if not hasattr(df, '__dataframe__'): + raise ValueError("`df` does not support __dataframe__") + + return _from_dataframe(df.__dataframe__(), copy=copy) + + +def _from_dataframe(df : DataFrameObject, copy: bool = False) : + """ + Note: not all cases are handled yet, only ones that can be implemented with + only Pandas. Later, we need to implement/test support for categoricals, + bit/byte masks, chunk handling, etc. + """ + # Check number of chunks, if there's more than one we need to iterate + if df.num_chunks() > 1: + raise NotImplementedError + + # We need a dict of columns here, with each column being a numpy array (at + # least for now, deal with non-numpy dtypes later). + columns = dict() + _k = _DtypeKind + for name in df.column_names(): + col = df.get_column_by_name(name) + if col.dtype[0] in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL): + # Simple numerical or bool dtype, turn into numpy array + columns[name] = convert_column_to_cupy_ndarray(col, copy=copy) + elif col.dtype[0] == _k.CATEGORICAL: + columns[name] = convert_categorical_column(col, copy=copy) + names = df.column_names() + else: + raise NotImplementedError(f"Data type {col.dtype[0]} not handled yet") + + return cudf.DataFrame(columns) + + + +class _DtypeKind(enum.IntEnum): + INT = 0 + UINT = 1 + FLOAT = 2 + BOOL = 20 + STRING = 21 # UTF-8 + DATETIME = 22 + CATEGORICAL = 23 + + +def convert_column_to_cupy_ndarray(col : ColumnObject, copy : bool = False) -> np.ndarray: + """ + Convert an int, uint, float or bool column to a numpy array + """ + if col.offset != 0: + raise NotImplementedError("column.offset > 0 not handled yet") + + if col.describe_null[0] not in (0, 1): + raise NotImplementedError("Null values represented as masks or " + "sentinel values not handled yet") + + _buffer, _dtype = col.get_data_buffer() + return buffer_to_cupy_ndarray(_buffer, _dtype, copy=copy) + +def buffer_to_cupy_ndarray(_buffer, _dtype, copy : bool = False) -> cp.ndarray: + if _buffer.__dlpack_device__()[0] == 2: # dataframe is on GPU/CUDA + x = cp.fromDlpack(_buffer.__dlpack__()) + + elif copy == False: + raise TypeError("This operation must copy data from CPU to GPU. Set `copy=True` to allow it.") + + else: + x = _copy_buffer_to_gpu(_buffer, _dtype) + + return x + + +def _copy_buffer_to_gpu(_buffer, _dtype): + # Handle the dtype + kind = _dtype[0] + bitwidth = _dtype[1] + _k = _DtypeKind + if _dtype[0] not in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL): + raise RuntimeError("Not a boolean, integer or floating-point dtype") + + _ints = {8: np.int8, 16: np.int16, 32: np.int32, 64: np.int64} + _uints = {8: np.uint8, 16: np.uint16, 32: np.uint32, 64: np.uint64} + _floats = {32: np.float32, 64: np.float64} + _np_dtypes = {0: _ints, 1: _uints, 2: _floats, 20: {8: bool}} + column_dtype = _np_dtypes[kind][bitwidth] + + # No DLPack yet, so need to construct a new ndarray from the data pointer + # and size in the buffer plus the dtype on the column + ctypes_type = np.ctypeslib.as_ctypes_type(column_dtype) + data_pointer = ctypes.cast(_buffer.ptr, ctypes.POINTER(ctypes_type)) + + # NOTE: `x` does not own its memory, so the caller of this function must + # either make a copy or hold on to a reference of the column or + # buffer! (not done yet, this is pretty awful ...) + x = np.ctypeslib.as_array(data_pointer, + shape=(_buffer.bufsize // (bitwidth//8),)) + return cp.array(x, dtype=column_dtype) + + +def convert_categorical_column(col : ColumnObject, copy:bool=False) : + """ + Convert a categorical column to a Series instance + """ + + + ordered, is_dict, mapping = col.describe_categorical + if not is_dict: + raise NotImplementedError('Non-dictionary categoricals not supported yet') + + # If you want to cheat for testing (can't use `_col` in real-world code): + # categories = col._col.values.categories.values + # codes = col._col.values.codes + categories = cp.asarray(list(mapping.values())) + codes_buffer, codes_dtype = col.get_data_buffer() + codes = buffer_to_cupy_ndarray(codes_buffer, codes_dtype, copy=copy) + values = categories[codes] + + # Seems like Pandas can only construct with non-null values, so need to + # null out the nulls later + cat = cudf.CategoricalIndex(values, categories=categories, ordered=ordered) + series = cudf.Series(cat) + null_kind = col.describe_null[0] + if null_kind == 2: # sentinel value + sentinel = col.describe_null[1] + series[codes == sentinel] = None + else: + raise NotImplementedError("Only categorical columns with sentinel " + "value supported at the moment") + + return series + + +def __dataframe__(self, nan_as_null : bool = False) -> dict: + """ + , target_device:str = 'gpu' + The public method to attach to cudf.DataFrame + + We'll attach it via monkeypatching here for demo purposes. If Pandas adopt + the protocol, this will be a regular method on pandas.DataFrame. + + ``nan_as_null`` is a keyword intended for the consumer to tell the + producer to overwrite null values in the data with ``NaN`` (or ``NaT``). + This currently has no effect; once support for nullable extension + dtypes is added, this value should be propagated to columns. + + ``target_device`` specifies the device where the returned dataframe protocol + object will live. Only `cpu` and `gpu` are supported for now. + """ + # if target_device not in ['cpu', 'gpu']: + # raise TypeError (f'Device {device} support not handle.') + + # if device == 'cpu': + # raise TypeError("This operation will copy data from GPU to CPU. Set `copy=True` to allow it.") + + + return _CuDFDataFrame(self, nan_as_null=nan_as_null) + + +# Monkeypatch the Pandas DataFrame class to support the interchange protocol +# cudf.DataFrame.__dataframe__ = __dataframe__ + + +# Implementation of interchange protocol +# -------------------------------------- + +class _CuDFBuffer: + + """ + Data in the buffer is guaranteed to be contiguous in memory. + Note that there is no dtype attribute present, a buffer can be thought of + as simply a block of memory. However, if the column that the buffer is + attached to has a dtype that's supported by DLPack and ``__dlpack__`` is + implemented, then that dtype information will be contained in the return + value from ``__dlpack__``. + This distinction is useful to support both data exchange via DLPack on a + buffer and (b) dtypes like variable-length strings which do not have a + fixed number of bytes per element. + + + Data in the buffer is guaranteed to be contiguous in memory. + """ + + def __init__(self, x : cp.ndarray) -> None: + """ + Handle only regular columns (= cupy arrays) for now. + """ + if not x.strides == (x.dtype.itemsize,): + # Array is not contiguous - this is possible to get in Pandas, + # there was some discussion on whether to support it. Some extra + # complexity for libraries that don't support it (e.g. Arrow), + # but would help with cupy-based libraries like CuDF. + raise RuntimeError("Design needs fixing - non-contiguous buffer") + + # Store the numpy array in which the data resides as a private + # attribute, so we can use it to retrieve the public attributes + self._x = x + + @property + def bufsize(self) -> int: + """ + Buffer size in bytes + """ + return self._x.data.mem.size + # return self._x.size * self._x.dtype.itemsize + + @property + def ptr(self) -> int: + """ + Pointer to start of the buffer as an integer + """ + # return self._x.data.mem.ptr + return self._x.__cuda_array_interface__['data'][0] + + def __dlpack__(self): + + """ + Produce DLPack capsule (see array API standard). + Raises: + - TypeError : if the buffer contains unsupported dtypes. + - NotImplementedError : if DLPack support is not implemented + Useful to have to connect to array libraries. Support optional because + it's not completely trivial to implement for a Python-only library. + + + DLPack implemented in CuPy + """ + try: + res = self._x.toDlpack() + except ValueError: + raise TypeError(f'dtype {self._x.dtype} unsupported by `dlpack`') + + return res + + def __dlpack_device__(self) -> Tuple[enum.IntEnum, int]: + + """ + Device type and device ID for where the data in the buffer resides. + Uses device type codes matching DLPack. Enum members are:: + - CPU = 1 + - CUDA = 2 + - CPU_PINNED = 3 + - OPENCL = 4 + - VULKAN = 7 + - METAL = 8 + - VPI = 9 + - ROCM = 10 + Note: must be implemented even if ``__dlpack__`` is not. + + + Device type and device ID for where the data in the buffer resides. + """ + class Device(enum.IntEnum): + CUDA = 2 + + return (Device.CUDA, self._x.device.id) + + def __repr__(self) -> str: + return 'CuDFBuffer(' + str({'bufsize': self.bufsize, + 'ptr': self.ptr, + 'dlpack': self.__dlpack__(), + 'device': self.__dlpack_device__()[0].name} + ) + ')' + +class _CuDFColumn: + """ + A column object, with only the methods and properties required by the + interchange protocol defined. + + A column can contain one or more chunks. Each chunk can contain either one + or two buffers - one data buffer and (depending on null representation) it + may have a mask buffer. + + TBD: Arrow has a separate "null" dtype, and has no separate mask concept. + Instead, it seems to use "children" for both columns with a bit mask, + and for nested dtypes. Unclear whether this is elegant or confusing. + This design requires checking the null representation explicitly. + The Arrow design requires checking: + 1. the ARROW_FLAG_NULLABLE (for sentinel values) + 2. if a column has two children, combined with one of those children + having a null dtype. + Making the mask concept explicit seems useful. One null dtype would + not be enough to cover both bit and byte masks, so that would mean + even more checking if we did it the Arrow way. + TBD: there's also the "chunk" concept here, which is implicit in Arrow as + multiple buffers per array (= column here). Semantically it may make + sense to have both: chunks were meant for example for lazy evaluation + of data which doesn't fit in memory, while multiple buffers per column + could also come from doing a selection operation on a single + contiguous buffer. + Given these concepts, one would expect chunks to be all of the same + size (say a 10,000 row dataframe could have 10 chunks of 1,000 rows), + while multiple buffers could have data-dependent lengths. Not an issue + in pandas if one column is backed by a single NumPy array, but in + Arrow it seems possible. + Are multiple chunks *and* multiple buffers per column necessary for + the purposes of this interchange protocol, or must producers either + reuse the chunk concept for this or copy the data? + + + Note: this Column object can only be produced by ``__dataframe__``, so + doesn't need its own version or ``__column__`` protocol. + + """ + + def __init__(self, column) -> None: + """ + Note: doesn't deal with extension arrays yet, just assume a regular + Series/ndarray for now. + """ + if not isinstance(column, cudf.Series): + raise NotImplementedError(f"Columns of type {type(column)} not handled yet") + + # Store the column as a private attribute + self._col = column + + @property + def size(self) -> int: + """ + Size of the column, in elements. + + Corresponds to DataFrame.num_rows() if column is a single chunk; + equal to size of this current chunk otherwise. + """ + return self._col.size + + @property + def offset(self) -> int: + """ + Offset of first element. Always zero. + TODO: check `Always zero (in case of cudf)?` + + May be > 0 if using chunks; for example for a column with N chunks of + equal size M (only the last chunk may be shorter), + ``offset = n * M``, ``n = 0 .. N-1``. + """ + return 0 + + @property + def dtype(self) -> Tuple[enum.IntEnum, int, str, str]: + """ + Dtype description as a tuple ``(kind, bit-width, format string, endianness)`` + + Kind : + + - INT = 0 + - UINT = 1 + - FLOAT = 2 + - BOOL = 20 + - STRING = 21 # UTF-8 + - DATETIME = 22 + - CATEGORICAL = 23 + + Bit-width : the number of bits as an integer + Format string : data type description format string in Apache Arrow C + Data Interface format. + Endianness : current only native endianness (``=``) is supported + + Notes: + + - Kind specifiers are aligned with DLPack where possible (hence the + jump to 20, leave enough room for future extension) + - Masks must be specified as boolean with either bit width 1 (for bit + masks) or 8 (for byte masks). + - Dtype width in bits was preferred over bytes + - Endianness isn't too useful, but included now in case in the future + we need to support non-native endianness + - Went with Apache Arrow format strings over NumPy format strings + because they're more complete from a dataframe perspective + - Format strings are mostly useful for datetime specification, and + for categoricals. + - For categoricals, the format string describes the type of the + categorical in the data buffer. In case of a separate encoding of + the categorical (e.g. an integer to string mapping), this can + be derived from ``self.describe_categorical``. + - Data types not included: complex, Arrow-style null, binary, decimal, + and nested (list, struct, map, union) dtypes. + """ + dtype = self._col.dtype + return self._dtype_from_cudfdtype(dtype) + + def _dtype_from_cudfdtype(self, dtype) -> Tuple[enum.IntEnum, int, str, str]: + """ + See `self.dtype` for details + """ + # Note: 'c' (complex) not handled yet (not in array spec v1). + # 'b', 'B' (bytes), 'S', 'a', (old-style string) 'V' (void) not handled + # datetime and timedelta both map to datetime (is timedelta handled?) + _k = _DtypeKind + _np_kinds = {'i': _k.INT, 'u': _k.UINT, 'f': _k.FLOAT, 'b': _k.BOOL, + 'U': _k.STRING, + 'M': _k.DATETIME, 'm': _k.DATETIME} + kind = _np_kinds.get(dtype.kind, None) + if kind is None: + # Not a NumPy dtype. Check if it's a categorical maybe + # CuPy uses NumPy dtypes. + if isinstance(dtype, cudf.CategoricalDtype): + kind = 23 + # Codes and categorical values dtypes are different. + # We use codes' dtype as these are stored in the buffer. + dtype = self._col.cat.codes.dtype + else: + raise ValueError(f"Data type {dtype} not supported by exchange" + "protocol") + + if kind not in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL, _k.CATEGORICAL): + raise NotImplementedError(f"Data type {dtype} not handled yet") + + bitwidth = dtype.itemsize * 8 + format_str = dtype.str + endianness = dtype.byteorder if not kind == _k.CATEGORICAL else '=' + return (kind, bitwidth, format_str, endianness) + + + @property + def describe_categorical(self) -> Dict[str, Any]: + """ + If the dtype is categorical, there are two options: + + - There are only values in the data buffer. + - There is a separate dictionary-style encoding for categorical values. + + Raises RuntimeError if the dtype is not categorical + + Content of returned dict: + + - "is_ordered" : bool, whether the ordering of dictionary indices is + semantically meaningful. + - "is_dictionary" : bool, whether a dictionary-style mapping of + categorical values to other objects exists + - "mapping" : dict, Python-level only (e.g. ``{int: str}``). + None if not a dictionary-style categorical. + + + TBD: are there any other in-memory representations that are needed? + """ + if not self.dtype[0] == _DtypeKind.CATEGORICAL: + raise TypeError("`describe_categorical only works on a column with " + "categorical dtype!") + + ordered = self._col.dtype.ordered + is_dictionary = True + # NOTE: this shows the children approach is better, transforming + # `categories` to a "mapping" dict is inefficient + codes = self._col.cat.codes # ndarray, length `self.size` + # categories.values is ndarray of length n_categories + categories = self._col.cat.categories + mapping = {ix: val for ix, val in enumerate(categories.values_host)} + return ordered, is_dictionary, mapping + + @property + def describe_null(self) -> Tuple[int, Any]: + """ + Return the missing value (or "null") representation the column dtype + uses, as a tuple ``(kind, value)``. + + Kind: + + - 0 : non-nullable + - 1 : NaN/NaT + - 2 : sentinel value + - 3 : bit mask + - 4 : byte mask + + Value : if kind is "sentinel value", the actual value. None otherwise. + """ + _k = _DtypeKind + kind = self.dtype[0] + value = None + if kind == _k.FLOAT: + null = 1 # np.nan + elif kind == _k.DATETIME: + null = 1 # np.datetime64('NaT') + elif kind in (_k.INT, _k.UINT, _k.BOOL): + # TODO: check if extension dtypes are used once support for them is + # implemented in this procotol code + null = 0 # integer and boolean dtypes are non-nullable + elif kind == _k.CATEGORICAL: + # Null values for categoricals are stored as `-1` sentinel values + # in the category date (e.g., `col.cat.codes` is uint8 np.ndarray at least) + null = 2 + value = -1 + else: + raise NotImplementedError(f'Data type {self.dtype} not yet supported') + + return null, value + + @property + def null_count(self) -> int: + """ + Number of null elements. Should always be known. + + Note: Arrow uses -1 to indicate "unknown", but None seems cleaner. + """ + return self._col.isna().sum() + + def num_chunks(self) -> int: + """ + Return the number of chunks the column consists of. + + TBC: Seems like chunks are used for parallel computation purpose in cudf:`apply_chunks`. + """ + return 1 + + def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable['_CuDFColumn']: + """ + Return an iterator yielding the chunks. + + See `DataFrame.get_chunks` for details on ``n_chunks``. + """ + return (self,) + + def get_data_buffer(self) -> Tuple[_CuDFBuffer, Any]: # Any is for self.dtype tuple + """ + Return the buffer containing the data. + """ + _k = _DtypeKind + if self.dtype[0] in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL): + buffer = _CuDFBuffer(cp.array(self._col.to_gpu_array(), copy=False)) + dtype = self.dtype + elif self.dtype[0] == _k.CATEGORICAL: + _, value = self.describe_null + codes = self._col.cat.codes + # handling null/NaN + buffer = _CuDFBuffer(cp.array(codes.fillna(100), copy=False)) + dtype = self._dtype_from_cudfdtype(codes.dtype) + else: + raise NotImplementedError(f"Data type {self._col.dtype} not handled yet") + + return buffer, dtype + + def get_mask(self) -> _CuDFBuffer: + """ + Return the buffer containing the mask values indicating missing data. + + Raises RuntimeError if null representation is not a bit or byte mask. + """ + null, value = self.describe_null + if null == 0: + msg = "This column is non-nullable so does not have a mask" + elif null == 1: + msg = "This column uses NaN as null so does not have a separate mask" + else: + raise NotImplementedError('See self.describe_null') + + raise RuntimeError(msg) + + # def get_children(self) -> Iterable[Column]: + # """ + # Children columns underneath the column, each object in this iterator + # must adhere to the column specification + # """ + # pass + +class _CuDFDataFrame: + """ + A data frame class, with only the methods required by the interchange + protocol defined. + + Instances of this (private) class are returned from + ``cudf.DataFrame.__dataframe__`` as objects with the methods and + attributes defined on this class. + """ + def __init__(self, df, nan_as_null : bool = False) -> None: + """ + , device:str = 'gpu' + Constructor - an instance of this (private) class is returned from + `cudf.DataFrame.__dataframe__`. + """ + # ``nan_as_null`` is a keyword intended for the consumer to tell the + # producer to overwrite null values in the data with ``NaN`` (or ``NaT``). + # This currently has no effect; once support for nullable extension + # dtypes is added, this value should be propagated to columns. + # + # ``device`` indicates the target device for the data. + self._nan_as_null = nan_as_null + self._df = df + + def num_columns(self) -> int: + return len(self._df.columns) + + def num_rows(self) -> int: + return len(self._df) + + def num_chunks(self) -> int: + return 1 + + def column_names(self) -> Iterable[str]: + return self._df.columns.tolist() + + def get_column(self, i: int) -> _CuDFColumn: + return _CuDFColumn(self._df.iloc[:, i]) + + def get_column_by_name(self, name: str) -> _CuDFColumn: + return _CuDFColumn(self._df[name]) + + def get_columns(self) -> Iterable[_CuDFColumn]: + return [_CuDFColumn(self._df[name]) for name in self._df.columns] + + def select_columns(self, indices: Sequence[int]) -> '_CuDFDataFrame': + if not isinstance(indices, collections.Sequence): + raise ValueError("`indices` is not a sequence") + + return _CuDFDataFrame(self._df.iloc[:, indices]) + + def select_columns_by_name(self, names: Sequence[str]) -> '_CuDFDataFrame': + """ + Create a new DataFrame by selecting a subset of columns by name. + + Don't use pandas.DataFrame `xs` method as : + def xs(self, key, axis=0, level=None, drop_level: bool_t = True): + + Return cross-section from the Series/DataFrame. + + This method takes a `key` argument to select data at a particular + level of a MultiIndex. + """ + if not isinstance(names, collections.Sequence): + raise ValueError("`names` is not a sequence") + + return _CuDFDataFrame(self._df.loc[:, names]) + + def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable['_CuDFDataFrame']: + """ + Return an iterator yielding the chunks. + """ + return (self,) diff --git a/python/cudf/cudf/tests/test_df_protocol.py b/python/cudf/cudf/tests/test_df_protocol.py new file mode 100644 index 00000000000..ff720ac807c --- /dev/null +++ b/python/cudf/cudf/tests/test_df_protocol.py @@ -0,0 +1,65 @@ +import datetime +import cupy +import numpy as np +import pytest +from cudf.core import df_protocol + +import cudf +from cudf.testing import _utils as utils +from cudf.testing._utils import ( + ALL_TYPES, + DATETIME_TYPES, + NUMERIC_TYPES, + assert_eq, + assert_exceptions_equal, + does_not_raise, + gen_rand, +) + + +def _from_dataframe_equals(df, copy=False): + df2 = df_protocol._from_dataframe(df.__dataframe__(), copy=copy) + assert_eq(df, df2) + +def _from_dataframe_exception(df): + exception_msg = "This operation must copy data from CPU to GPU. Set `copy=True` to allow it." + with pytest.raises(TypeError, match=exception_msg): + df2 = from_dataframe(df, copy=False) + +def _datatype(data): + cdf = cudf.DataFrame(data=data) + _from_dataframe_equals(cdf, copy=False) + _from_dataframe_equals(cdf, copy=True) + + +def test_int_dtype(): + data_int = dict(a=[1, 2, 3], b=[9, 10, 11]) + _datatype(data_int) + +def test_float_dtype(): + data_float = dict(a=[1.5, 2.5, 3.5], b=[9.2, 10.5, 11.8]) + _datatype(data_float) + +def test_mixed_intfloat_dtype(): + data_intfloat = dict(a=[1, 2, 3], b=[1.5, 2.5, 3.5]) + _datatype(data_intfloat) + +def test_categorical_dtype(): + + def test__dataframe__(df): + # Some detailed testing for correctness of dtype: + col = df.__dataframe__().get_column_by_name('A') + assert col.dtype[0] == df_protocol._DtypeKind.CATEGORICAL + assert col.null_count == 0 + assert col.num_chunks() == 1 + assert col.describe_categorical == (False, True, {0: 1, 1: 2, 2: 5}) + + cdf = cudf.DataFrame({"A": [1, 2, 5, 1]}) + cdf["A"] = cdf["A"].astype("category") + test__dataframe__(cdf) + _from_dataframe_equals(cdf, copy=False) + _from_dataframe_equals(cdf, copy=True) + +# def test_bool_dtype(): +# data_bool = dict(a=[True, True, False], b=[False, True, False]) +# _datatype(data_bool) \ No newline at end of file From 4367d8f5e6c2b27eeed19111fb3d3a3f1e8713f2 Mon Sep 17 00:00:00 2001 From: iskode Date: Thu, 19 Aug 2021 12:52:07 +0000 Subject: [PATCH 02/60] refactor to call from_dataframe on cudf directly and __dataframe__() on the dataframe object --- python/cudf/cudf/__init__.py | 1 + python/cudf/cudf/core/__init__.py | 2 +- python/cudf/cudf/core/dataframe.py | 13 ++++++++++--- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py index 13c20d8bcd4..112fbe118ad 100644 --- a/python/cudf/cudf/__init__.py +++ b/python/cudf/cudf/__init__.py @@ -41,6 +41,7 @@ UInt64Index, cut, from_pandas, + from_dataframe, interval_range, merge, ) diff --git a/python/cudf/cudf/core/__init__.py b/python/cudf/cudf/core/__init__.py index 016aba2edb3..7e825d38b7f 100644 --- a/python/cudf/cudf/core/__init__.py +++ b/python/cudf/cudf/core/__init__.py @@ -2,7 +2,7 @@ from cudf.core import _internals, buffer, column, column_accessor, common from cudf.core.buffer import Buffer -from cudf.core.dataframe import DataFrame, from_pandas, merge +from cudf.core.dataframe import DataFrame, from_pandas, merge, from_dataframe from cudf.core.index import ( BaseIndex, CategoricalIndex, diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 4388ad20c53..2be9e37c35f 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -53,6 +53,8 @@ numeric_normalize_types, ) from cudf.utils.utils import GetAttrGetItemMixin +from cudf.core import df_protocol + T = TypeVar("T", bound="DataFrame") @@ -7410,6 +7412,11 @@ def explode(self, column, ignore_index=False): return super()._explode(column, ignore_index) + def __dataframe__(self, nan_as_null : bool = False): + return df_protocol.__dataframe__(self, nan_as_null=nan_as_null) + +def from_dataframe(df, copy = False): + return df_protocol.from_dataframe(df, copy=copy) def from_pandas(obj, nan_as_null=None): """ @@ -7684,7 +7691,7 @@ def _drop_columns(df: DataFrame, columns: Iterable, errors: str): else: raise e -from cudf.core.df_protocol import __dataframe__, from_dataframe +# from cudf.core.df_protocol import __dataframe__, from_dataframe -DataFrame.__dataframe__ = __dataframe__ -DataFrame.from_dataframe = from_dataframe +# DataFrame.__dataframe__ = __dataframe__ +# DataFrame.from_dataframe = from_dataframe From 331c69618f2c4391d5400904dc202c2bf7c14776 Mon Sep 17 00:00:00 2001 From: iskode Date: Thu, 19 Aug 2021 14:41:14 +0000 Subject: [PATCH 03/60] remove commented monkeypatch --- python/cudf/cudf/core/dataframe.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 2be9e37c35f..38b2141b987 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -7690,8 +7690,3 @@ def _drop_columns(df: DataFrame, columns: Iterable, errors: str): pass else: raise e - -# from cudf.core.df_protocol import __dataframe__, from_dataframe - -# DataFrame.__dataframe__ = __dataframe__ -# DataFrame.from_dataframe = from_dataframe From c83b4d7669e9d980af7b70fa8d74930e02d64d97 Mon Sep 17 00:00:00 2001 From: iskode Date: Thu, 19 Aug 2021 14:41:59 +0000 Subject: [PATCH 04/60] refactor test cases --- python/cudf/cudf/tests/test_df_protocol.py | 53 ++++++++++++++++------ 1 file changed, 38 insertions(+), 15 deletions(-) diff --git a/python/cudf/cudf/tests/test_df_protocol.py b/python/cudf/cudf/tests/test_df_protocol.py index ff720ac807c..d27cb6d4a5d 100644 --- a/python/cudf/cudf/tests/test_df_protocol.py +++ b/python/cudf/cudf/tests/test_df_protocol.py @@ -2,7 +2,12 @@ import cupy import numpy as np import pytest -from cudf.core import df_protocol +from cudf.core.df_protocol import ( + _from_dataframe, + _DtypeKind, + __dataframe__, + _CuDFDataFrame +) import cudf from cudf.testing import _utils as utils @@ -15,41 +20,59 @@ does_not_raise, gen_rand, ) +import pandas as pd + + +def _test_from_dataframe_equals(dfobj, copy=False): + df2 = _from_dataframe(dfobj, copy=copy) + + if isinstance(dfobj._df, cudf.DataFrame): + assert_eq(dfobj._df, df2) + elif isinstance(dfobj._df, pd.DataFrame): + assert_eq(cudf.DataFrame(dfobj._df), df2) -def _from_dataframe_equals(df, copy=False): - df2 = df_protocol._from_dataframe(df.__dataframe__(), copy=copy) - assert_eq(df, df2) + else: + raise TypeError(f"{type(dfobj._df)} not supported yet.") -def _from_dataframe_exception(df): + +def _test_from_dataframe_exception(dfobj): exception_msg = "This operation must copy data from CPU to GPU. Set `copy=True` to allow it." with pytest.raises(TypeError, match=exception_msg): - df2 = from_dataframe(df, copy=False) + df2 = _from_dataframe(dfobj, copy=False) -def _datatype(data): +def _test_datatype(data): cdf = cudf.DataFrame(data=data) - _from_dataframe_equals(cdf, copy=False) - _from_dataframe_equals(cdf, copy=True) + cdfobj = cdf.__dataframe__() + print(cdfobj) + _test_from_dataframe_equals(cdfobj, copy=False) + _test_from_dataframe_equals(cdfobj, copy=True) + + # pdf = pd.DataFrame(data=data) + # cpu_dfobj = _CuDFDataFrame(pdf) + # _test_from_dataframe_exception(cpu_dfobj) + # _test_from_dataframe_equals(cpu_dfobj, copy=True) + def test_int_dtype(): data_int = dict(a=[1, 2, 3], b=[9, 10, 11]) - _datatype(data_int) + _test_datatype(data_int) def test_float_dtype(): data_float = dict(a=[1.5, 2.5, 3.5], b=[9.2, 10.5, 11.8]) - _datatype(data_float) + _test_datatype(data_float) def test_mixed_intfloat_dtype(): data_intfloat = dict(a=[1, 2, 3], b=[1.5, 2.5, 3.5]) - _datatype(data_intfloat) + _test_datatype(data_intfloat) def test_categorical_dtype(): def test__dataframe__(df): # Some detailed testing for correctness of dtype: col = df.__dataframe__().get_column_by_name('A') - assert col.dtype[0] == df_protocol._DtypeKind.CATEGORICAL + assert col.dtype[0] == _DtypeKind.CATEGORICAL assert col.null_count == 0 assert col.num_chunks() == 1 assert col.describe_categorical == (False, True, {0: 1, 1: 2, 2: 5}) @@ -57,8 +80,8 @@ def test__dataframe__(df): cdf = cudf.DataFrame({"A": [1, 2, 5, 1]}) cdf["A"] = cdf["A"].astype("category") test__dataframe__(cdf) - _from_dataframe_equals(cdf, copy=False) - _from_dataframe_equals(cdf, copy=True) + _test_from_dataframe_equals(cdf.__dataframe__(), copy=False) + _test_from_dataframe_equals(cdf.__dataframe__(), copy=True) # def test_bool_dtype(): # data_bool = dict(a=[True, True, False], b=[False, True, False]) From defcbc57c48e4cd5891f34273ff584c7bd9debfe Mon Sep 17 00:00:00 2001 From: iskode Date: Thu, 19 Aug 2021 22:40:08 +0000 Subject: [PATCH 05/60] propagate nan_as_null from DataFrame to Column class --- python/cudf/cudf/core/df_protocol.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index d252c881be0..eae7bf9e9f7 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -348,7 +348,7 @@ class _CuDFColumn: """ - def __init__(self, column) -> None: + def __init__(self, column, nan_as_null=False) -> None: """ Note: doesn't deal with extension arrays yet, just assume a regular Series/ndarray for now. @@ -358,6 +358,7 @@ def __init__(self, column) -> None: # Store the column as a private attribute self._col = column + self._nan_as_null = nan_as_null @property def size(self) -> int: From 7c197205060e092bc097d669ef856e98718c0f89 Mon Sep 17 00:00:00 2001 From: iskode Date: Thu, 26 Aug 2021 11:21:13 +0000 Subject: [PATCH 06/60] start missing value supports + int missing value tests --- python/cudf/cudf/core/df_protocol.py | 69 ++++++++++++++-------- python/cudf/cudf/tests/test_df_protocol.py | 9 +++ 2 files changed, 52 insertions(+), 26 deletions(-) diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index eae7bf9e9f7..f4da1ab7efc 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -96,14 +96,15 @@ def convert_column_to_cupy_ndarray(col : ColumnObject, copy : bool = False) -> n if col.offset != 0: raise NotImplementedError("column.offset > 0 not handled yet") - if col.describe_null[0] not in (0, 1): - raise NotImplementedError("Null values represented as masks or " - "sentinel values not handled yet") + # if col.describe_null[0] not in (0, 1): + # raise NotImplementedError("Null values represented as masks or " + # "sentinel values not handled yet") _buffer, _dtype = col.get_data_buffer() - return buffer_to_cupy_ndarray(_buffer, _dtype, copy=copy) + _mask_buffer = col.get_mask() + return buffer_to_cupy_ndarray(_buffer, _dtype, _mask_buffer, copy=copy) -def buffer_to_cupy_ndarray(_buffer, _dtype, copy : bool = False) -> cp.ndarray: +def buffer_to_cupy_ndarray(_buffer, _dtype, _mask_buffer = None, copy : bool = False) -> cp.ndarray: if _buffer.__dlpack_device__()[0] == 2: # dataframe is on GPU/CUDA x = cp.fromDlpack(_buffer.__dlpack__()) @@ -158,7 +159,8 @@ def convert_categorical_column(col : ColumnObject, copy:bool=False) : # codes = col._col.values.codes categories = cp.asarray(list(mapping.values())) codes_buffer, codes_dtype = col.get_data_buffer() - codes = buffer_to_cupy_ndarray(codes_buffer, codes_dtype, copy=copy) + _mask_buffer = col.get_mask() + codes = buffer_to_cupy_ndarray(codes_buffer, codes_dtype, _mask_buffer, copy=copy) values = categories[codes] # Seems like Pandas can only construct with non-null values, so need to @@ -513,19 +515,21 @@ def describe_null(self) -> Tuple[int, Any]: _k = _DtypeKind kind = self.dtype[0] value = None - if kind == _k.FLOAT: - null = 1 # np.nan - elif kind == _k.DATETIME: - null = 1 # np.datetime64('NaT') - elif kind in (_k.INT, _k.UINT, _k.BOOL): - # TODO: check if extension dtypes are used once support for them is - # implemented in this procotol code - null = 0 # integer and boolean dtypes are non-nullable - elif kind == _k.CATEGORICAL: - # Null values for categoricals are stored as `-1` sentinel values - # in the category date (e.g., `col.cat.codes` is uint8 np.ndarray at least) - null = 2 - value = -1 + if kind in (_k.INT, _k.UINT, _k.FLOAT, _k.CATEGORICAL): + null = 3 + # if kind == _k.FLOAT: + # null = 1 # np.nan + # elif kind == _k.DATETIME: + # null = 1 # np.datetime64('NaT') + # elif kind in (_k.INT, _k.UINT, _k.BOOL): + # # TODO: check if extension dtypes are used once support for them is + # # implemented in this procotol code + # null = 0 # integer and boolean dtypes are non-nullable + # elif kind == _k.CATEGORICAL: + # # Null values for categoricals are stored as `-1` sentinel values + # # in the category date (e.g., `col.values.codes` is int8 np.ndarray) + # null = 2 + # value = -1 else: raise NotImplementedError(f'Data type {self.dtype} not yet supported') @@ -562,13 +566,13 @@ def get_data_buffer(self) -> Tuple[_CuDFBuffer, Any]: # Any is for self.dtype t """ _k = _DtypeKind if self.dtype[0] in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL): - buffer = _CuDFBuffer(cp.array(self._col.to_gpu_array(), copy=False)) + buffer = _CuDFBuffer(cp.array(self._col.fillna(0).to_gpu_array(), copy=False)) dtype = self.dtype elif self.dtype[0] == _k.CATEGORICAL: _, value = self.describe_null codes = self._col.cat.codes # handling null/NaN - buffer = _CuDFBuffer(cp.array(codes.fillna(100), copy=False)) + buffer = _CuDFBuffer(cp.array(codes.fillna(0), copy=False)) dtype = self._dtype_from_cudfdtype(codes.dtype) else: raise NotImplementedError(f"Data type {self._col.dtype} not handled yet") @@ -582,14 +586,27 @@ def get_mask(self) -> _CuDFBuffer: Raises RuntimeError if null representation is not a bit or byte mask. """ null, value = self.describe_null + buffer = None if null == 0: msg = "This column is non-nullable so does not have a mask" elif null == 1: msg = "This column uses NaN as null so does not have a separate mask" + + elif null == 3: + + _k = _DtypeKind + if self.dtype[0] in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL): + buffer = _CuDFBuffer(cp.array(self._col.nullmask, copy=False)) + elif self.dtype[0] == _k.CATEGORICAL: + codes = self._col.cat.codes.nullmask + # handling null/NaN + buffer = _CuDFBuffer(cp.array(codes, copy=False)) + else: raise NotImplementedError('See self.describe_null') - raise RuntimeError(msg) + return buffer + # def get_children(self) -> Iterable[Column]: # """ @@ -635,13 +652,13 @@ def column_names(self) -> Iterable[str]: return self._df.columns.tolist() def get_column(self, i: int) -> _CuDFColumn: - return _CuDFColumn(self._df.iloc[:, i]) + return _CuDFColumn(self._df.iloc[:, i], self._nan_as_null) def get_column_by_name(self, name: str) -> _CuDFColumn: - return _CuDFColumn(self._df[name]) + return _CuDFColumn(self._df[name], self._nan_as_null) def get_columns(self) -> Iterable[_CuDFColumn]: - return [_CuDFColumn(self._df[name]) for name in self._df.columns] + return [_CuDFColumn(self._df[name], self._nan_as_null) for name in self._df.columns] def select_columns(self, indices: Sequence[int]) -> '_CuDFDataFrame': if not isinstance(indices, collections.Sequence): @@ -664,7 +681,7 @@ def xs(self, key, axis=0, level=None, drop_level: bool_t = True): if not isinstance(names, collections.Sequence): raise ValueError("`names` is not a sequence") - return _CuDFDataFrame(self._df.loc[:, names]) + return _CuDFDataFrame(self._df.loc[:, names], self._nan_as_null) def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable['_CuDFDataFrame']: """ diff --git a/python/cudf/cudf/tests/test_df_protocol.py b/python/cudf/cudf/tests/test_df_protocol.py index d27cb6d4a5d..99917b0a2a8 100644 --- a/python/cudf/cudf/tests/test_df_protocol.py +++ b/python/cudf/cudf/tests/test_df_protocol.py @@ -83,6 +83,15 @@ def test__dataframe__(df): _test_from_dataframe_equals(cdf.__dataframe__(), copy=False) _test_from_dataframe_equals(cdf.__dataframe__(), copy=True) +def test_NA_int_dtype(): + data_int = dict(a=[1, None, 3], b=[9, 10, None]) + _test_datatype(data_int) + +# def test_NA2_int_dtype(): +# data_int = dict(a=[1, None, 3, None, 5], b=[9, 10, None, 7, 8]) +# _test_datatype(data_int) + + # def test_bool_dtype(): # data_bool = dict(a=[True, True, False], b=[False, True, False]) # _datatype(data_bool) \ No newline at end of file From 89d00f2c24bedb84ec422d0104f1644e1f749584 Mon Sep 17 00:00:00 2001 From: iskode Date: Mon, 13 Sep 2021 11:38:04 +0000 Subject: [PATCH 07/60] apply protocol update changes --- python/cudf/cudf/core/df_protocol.py | 404 +++++++++++++++------------ 1 file changed, 219 insertions(+), 185 deletions(-) diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index f4da1ab7efc..ba5291fb08d 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -100,8 +100,8 @@ def convert_column_to_cupy_ndarray(col : ColumnObject, copy : bool = False) -> n # raise NotImplementedError("Null values represented as masks or " # "sentinel values not handled yet") - _buffer, _dtype = col.get_data_buffer() - _mask_buffer = col.get_mask() + _buffer, _dtype = col.get_buffers()['data'] + _mask_buffer = col.get_buffers()['validity'] return buffer_to_cupy_ndarray(_buffer, _dtype, _mask_buffer, copy=copy) def buffer_to_cupy_ndarray(_buffer, _dtype, _mask_buffer = None, copy : bool = False) -> cp.ndarray: @@ -158,8 +158,8 @@ def convert_categorical_column(col : ColumnObject, copy:bool=False) : # categories = col._col.values.categories.values # codes = col._col.values.codes categories = cp.asarray(list(mapping.values())) - codes_buffer, codes_dtype = col.get_data_buffer() - _mask_buffer = col.get_mask() + codes_buffer, codes_dtype = col.get_buffers()['data'] + _mask_buffer = col.get_buffers()['validity'] codes = buffer_to_cupy_ndarray(codes_buffer, codes_dtype, _mask_buffer, copy=copy) values = categories[codes] @@ -167,13 +167,18 @@ def convert_categorical_column(col : ColumnObject, copy:bool=False) : # null out the nulls later cat = cudf.CategoricalIndex(values, categories=categories, ordered=ordered) series = cudf.Series(cat) + null_kind = col.describe_null[0] - if null_kind == 2: # sentinel value - sentinel = col.describe_null[1] - series[codes == sentinel] = None - else: - raise NotImplementedError("Only categorical columns with sentinel " - "value supported at the moment") + if null_kind != 0: + print(null_kind) + if null_kind == 2: # sentinel value + sentinel = col.describe_null[1] + series[codes == sentinel] = None + elif null_kind == 3: + pass + else: + raise NotImplementedError("Only categorical columns with sentinel " + "value supported at the moment") return series @@ -203,7 +208,6 @@ def __dataframe__(self, nan_as_null : bool = False) -> dict: return _CuDFDataFrame(self, nan_as_null=nan_as_null) - # Monkeypatch the Pandas DataFrame class to support the interchange protocol # cudf.DataFrame.__dataframe__ = __dataframe__ @@ -212,32 +216,22 @@ def __dataframe__(self, nan_as_null : bool = False) -> dict: # -------------------------------------- class _CuDFBuffer: - """ - Data in the buffer is guaranteed to be contiguous in memory. - Note that there is no dtype attribute present, a buffer can be thought of - as simply a block of memory. However, if the column that the buffer is - attached to has a dtype that's supported by DLPack and ``__dlpack__`` is - implemented, then that dtype information will be contained in the return - value from ``__dlpack__``. - This distinction is useful to support both data exchange via DLPack on a - buffer and (b) dtypes like variable-length strings which do not have a - fixed number of bytes per element. - - Data in the buffer is guaranteed to be contiguous in memory. """ - def __init__(self, x : cp.ndarray) -> None: + def __init__(self, x : cp.ndarray, allow_copy : bool = True) -> None: """ - Handle only regular columns (= cupy arrays) for now. + Handle only regular columns (= numpy arrays) for now. """ if not x.strides == (x.dtype.itemsize,): - # Array is not contiguous - this is possible to get in Pandas, - # there was some discussion on whether to support it. Some extra - # complexity for libraries that don't support it (e.g. Arrow), - # but would help with cupy-based libraries like CuDF. - raise RuntimeError("Design needs fixing - non-contiguous buffer") + # The protocol does not support strided buffers, so a copy is + # necessary. If that's not allowed, we need to raise an exception. + if allow_copy: + x = x.copy() + else: + raise RuntimeError("Exports cannot be zero-copy in the case " + "of a non-contiguous buffer") # Store the numpy array in which the data resides as a private # attribute, so we can use it to retrieve the public attributes @@ -246,7 +240,7 @@ def __init__(self, x : cp.ndarray) -> None: @property def bufsize(self) -> int: """ - Buffer size in bytes + Buffer size in bytes. """ return self._x.data.mem.size # return self._x.size * self._x.dtype.itemsize @@ -254,23 +248,13 @@ def bufsize(self) -> int: @property def ptr(self) -> int: """ - Pointer to start of the buffer as an integer + Pointer to start of the buffer as an integer. """ - # return self._x.data.mem.ptr return self._x.__cuda_array_interface__['data'][0] def __dlpack__(self): - """ - Produce DLPack capsule (see array API standard). - Raises: - - TypeError : if the buffer contains unsupported dtypes. - - NotImplementedError : if DLPack support is not implemented - Useful to have to connect to array libraries. Support optional because - it's not completely trivial to implement for a Python-only library. - - - DLPack implemented in CuPy + DLPack not implemented in NumPy yet, so leave it out here. """ try: res = self._x.toDlpack() @@ -280,25 +264,11 @@ def __dlpack__(self): return res def __dlpack_device__(self) -> Tuple[enum.IntEnum, int]: - """ - Device type and device ID for where the data in the buffer resides. - Uses device type codes matching DLPack. Enum members are:: - - CPU = 1 - - CUDA = 2 - - CPU_PINNED = 3 - - OPENCL = 4 - - VULKAN = 7 - - METAL = 8 - - VPI = 9 - - ROCM = 10 - Note: must be implemented even if ``__dlpack__`` is not. - - Device type and device ID for where the data in the buffer resides. """ class Device(enum.IntEnum): - CUDA = 2 + CUDA = 2 return (Device.CUDA, self._x.device.id) @@ -314,61 +284,36 @@ class _CuDFColumn: A column object, with only the methods and properties required by the interchange protocol defined. - A column can contain one or more chunks. Each chunk can contain either one - or two buffers - one data buffer and (depending on null representation) it - may have a mask buffer. - - TBD: Arrow has a separate "null" dtype, and has no separate mask concept. - Instead, it seems to use "children" for both columns with a bit mask, - and for nested dtypes. Unclear whether this is elegant or confusing. - This design requires checking the null representation explicitly. - The Arrow design requires checking: - 1. the ARROW_FLAG_NULLABLE (for sentinel values) - 2. if a column has two children, combined with one of those children - having a null dtype. - Making the mask concept explicit seems useful. One null dtype would - not be enough to cover both bit and byte masks, so that would mean - even more checking if we did it the Arrow way. - TBD: there's also the "chunk" concept here, which is implicit in Arrow as - multiple buffers per array (= column here). Semantically it may make - sense to have both: chunks were meant for example for lazy evaluation - of data which doesn't fit in memory, while multiple buffers per column - could also come from doing a selection operation on a single - contiguous buffer. - Given these concepts, one would expect chunks to be all of the same - size (say a 10,000 row dataframe could have 10 chunks of 1,000 rows), - while multiple buffers could have data-dependent lengths. Not an issue - in pandas if one column is backed by a single NumPy array, but in - Arrow it seems possible. - Are multiple chunks *and* multiple buffers per column necessary for - the purposes of this interchange protocol, or must producers either - reuse the chunk concept for this or copy the data? - + A column can contain one or more chunks. Each chunk can contain up to three + buffers - a data buffer, a mask buffer (depending on null representation), + and an offsets buffer (if variable-size binary; e.g., variable-length + strings). Note: this Column object can only be produced by ``__dataframe__``, so doesn't need its own version or ``__column__`` protocol. """ - def __init__(self, column, nan_as_null=False) -> None: + def __init__(self, column, + nan_as_null : bool = True, + allow_copy: bool = False) -> None: """ Note: doesn't deal with extension arrays yet, just assume a regular Series/ndarray for now. """ if not isinstance(column, cudf.Series): - raise NotImplementedError(f"Columns of type {type(column)} not handled yet") + raise NotImplementedError("Columns of type {} not handled " + "yet".format(type(column))) # Store the column as a private attribute self._col = column self._nan_as_null = nan_as_null + self._allow_copy = allow_copy @property def size(self) -> int: """ Size of the column, in elements. - - Corresponds to DataFrame.num_rows() if column is a single chunk; - equal to size of this current chunk otherwise. """ return self._col.size @@ -376,11 +321,6 @@ def size(self) -> int: def offset(self) -> int: """ Offset of first element. Always zero. - TODO: check `Always zero (in case of cudf)?` - - May be > 0 if using chunks; for example for a column with N chunks of - equal size M (only the last chunk may be shorter), - ``offset = n * M``, ``n = 0 .. N-1``. """ return 0 @@ -425,23 +365,27 @@ def dtype(self) -> Tuple[enum.IntEnum, int, str, str]: and nested (list, struct, map, union) dtypes. """ dtype = self._col.dtype + + # For now, assume that, if the column dtype is 'O' (i.e., `object`), then we have an array of strings + if not isinstance(dtype, cudf.CategoricalDtype) and dtype.kind == 'O': + return (_DtypeKind.STRING, 8, 'u', '=') + return self._dtype_from_cudfdtype(dtype) def _dtype_from_cudfdtype(self, dtype) -> Tuple[enum.IntEnum, int, str, str]: """ - See `self.dtype` for details + See `self.dtype` for details. """ # Note: 'c' (complex) not handled yet (not in array spec v1). # 'b', 'B' (bytes), 'S', 'a', (old-style string) 'V' (void) not handled # datetime and timedelta both map to datetime (is timedelta handled?) _k = _DtypeKind - _np_kinds = {'i': _k.INT, 'u': _k.UINT, 'f': _k.FLOAT, 'b': _k.BOOL, - 'U': _k.STRING, - 'M': _k.DATETIME, 'm': _k.DATETIME} + _np_kinds = {"i": _k.INT, "u": _k.UINT, "f": _k.FLOAT, "b": _k.BOOL, + "U": _k.STRING, + "M": _k.DATETIME, "m": _k.DATETIME} kind = _np_kinds.get(dtype.kind, None) if kind is None: # Not a NumPy dtype. Check if it's a categorical maybe - # CuPy uses NumPy dtypes. if isinstance(dtype, cudf.CategoricalDtype): kind = 23 # Codes and categorical values dtypes are different. @@ -451,7 +395,7 @@ def _dtype_from_cudfdtype(self, dtype) -> Tuple[enum.IntEnum, int, str, str]: raise ValueError(f"Data type {dtype} not supported by exchange" "protocol") - if kind not in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL, _k.CATEGORICAL): + if kind not in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL, _k.CATEGORICAL, _k.STRING): raise NotImplementedError(f"Data type {dtype} not handled yet") bitwidth = dtype.itemsize * 8 @@ -459,7 +403,6 @@ def _dtype_from_cudfdtype(self, dtype) -> Tuple[enum.IntEnum, int, str, str]: endianness = dtype.byteorder if not kind == _k.CATEGORICAL else '=' return (kind, bitwidth, format_str, endianness) - @property def describe_categorical(self) -> Dict[str, Any]: """ @@ -478,9 +421,6 @@ def describe_categorical(self) -> Dict[str, Any]: categorical values to other objects exists - "mapping" : dict, Python-level only (e.g. ``{int: str}``). None if not a dictionary-style categorical. - - - TBD: are there any other in-memory representations that are needed? """ if not self.dtype[0] == _DtypeKind.CATEGORICAL: raise TypeError("`describe_categorical only works on a column with " @@ -510,28 +450,25 @@ def describe_null(self) -> Tuple[int, Any]: - 3 : bit mask - 4 : byte mask - Value : if kind is "sentinel value", the actual value. None otherwise. + Value : if kind is "sentinel value", the actual value. If kind is a bit + mask or a byte mask, the value (0 or 1) indicating a missing value. None + otherwise. """ - _k = _DtypeKind - kind = self.dtype[0] - value = None - if kind in (_k.INT, _k.UINT, _k.FLOAT, _k.CATEGORICAL): - null = 3 - # if kind == _k.FLOAT: - # null = 1 # np.nan - # elif kind == _k.DATETIME: - # null = 1 # np.datetime64('NaT') - # elif kind in (_k.INT, _k.UINT, _k.BOOL): - # # TODO: check if extension dtypes are used once support for them is - # # implemented in this procotol code - # null = 0 # integer and boolean dtypes are non-nullable - # elif kind == _k.CATEGORICAL: - # # Null values for categoricals are stored as `-1` sentinel values - # # in the category date (e.g., `col.values.codes` is int8 np.ndarray) - # null = 2 - # value = -1 - else: - raise NotImplementedError(f'Data type {self.dtype} not yet supported') + if self.null_count == 0: + # there is no validity mask in this case + # so making it non-nullable (hackingly) + null = 0 + value = None + else : + _k = _DtypeKind + kind = self.dtype[0] + # bit mask is universally used in cudf for missing + if kind in (_k.INT, _k.UINT, _k.FLOAT, _k.CATEGORICAL, + _k.STRING, _k.DATETIME): + null = 3 + value = 0 + else: + raise NotImplementedError(f"Data type {self.dtype} not yet supported") return null, value @@ -539,16 +476,19 @@ def describe_null(self) -> Tuple[int, Any]: def null_count(self) -> int: """ Number of null elements. Should always be known. - - Note: Arrow uses -1 to indicate "unknown", but None seems cleaner. """ return self._col.isna().sum() + @property + def metadata(self) -> Dict[str, Any]: + """ + Store specific metadata of the column. + """ + return {} + def num_chunks(self) -> int: """ Return the number of chunks the column consists of. - - TBC: Seems like chunks are used for parallel computation purpose in cudf:`apply_chunks`. """ return 1 @@ -560,60 +500,156 @@ def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable['_CuDFColumn'] """ return (self,) - def get_data_buffer(self) -> Tuple[_CuDFBuffer, Any]: # Any is for self.dtype tuple + def get_buffers(self) -> Dict[str, Any]: + """ + Return a dictionary containing the underlying buffers. + + The returned dictionary has the following contents: + + - "data": a two-element tuple whose first element is a buffer + containing the data and whose second element is the data + buffer's associated dtype. + - "validity": a two-element tuple whose first element is a buffer + containing mask values indicating missing data and + whose second element is the mask value buffer's + associated dtype. None if the null representation is + not a bit or byte mask. + - "offsets": a two-element tuple whose first element is a buffer + containing the offset values for variable-size binary + data (e.g., variable-length strings) and whose second + element is the offsets buffer's associated dtype. None + if the data buffer does not have an associated offsets + buffer. + """ + buffers = {} + buffers["data"] = self._get_data_buffer() + try: + buffers["validity"] = self._get_validity_buffer() + except: + buffers["validity"] = None + + try: + buffers["offsets"] = self._get_offsets_buffer() + except: + buffers["offsets"] = None + + return buffers + + def _get_data_buffer(self) -> Tuple[_CuDFBuffer, Any]: # Any is for self.dtype tuple """ - Return the buffer containing the data. + Return the buffer containing the data and the buffer's associated dtype. """ _k = _DtypeKind + invalid = self.describe_null[1] if self.dtype[0] in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL): - buffer = _CuDFBuffer(cp.array(self._col.fillna(0).to_gpu_array(), copy=False)) + buffer = _CuDFBuffer( + cp.array(self._col.fillna(invalid).to_gpu_array(), copy=False), + allow_copy=self._allow_copy) dtype = self.dtype elif self.dtype[0] == _k.CATEGORICAL: - _, value = self.describe_null codes = self._col.cat.codes - # handling null/NaN - buffer = _CuDFBuffer(cp.array(codes.fillna(0), copy=False)) + buffer = _CuDFBuffer( + cp.array(codes.fillna(invalid), copy=False), + allow_copy=self._allow_copy) dtype = self._dtype_from_cudfdtype(codes.dtype) + # elif self.dtype[0] == _k.STRING: + # # Marshal the strings from a NumPy object array into a byte array + # buf = self._col.to_numpy() + # b = bytearray() + + # # TODO: this for-loop is slow; can be implemented in Cython/C/C++ later + # for i in range(buf.size): + # if type(buf[i]) == str: + # b.extend(buf[i].encode(encoding="utf-8")) + + # # Convert the byte array to a Pandas "buffer" using a NumPy array as the backing store + # buffer = _CuDFBuffer(np.frombuffer(b, dtype="uint8")) + + # # Define the dtype for the returned buffer + # dtype = (_k.STRING, 8, "u", "=") # note: currently only support native endianness else: raise NotImplementedError(f"Data type {self._col.dtype} not handled yet") return buffer, dtype - def get_mask(self) -> _CuDFBuffer: + def unpackbits(myarray, bitorder="big"): + + bitorder_op = {"big": '(myarray[i / 8] >> (7 - i % 8)) & 1;', + "little": '(myarray[i / 8] >> (i % 8)) & 1;'} + operation = bitorder_op.get(bitorder, None) + if operation == None: + raise KeyError(f"bitorder must be either 'big' or 'little' not '{bitorder}'") + _unpackbits_kernel = _core.ElementwiseKernel( + 'raw uint8 myarray', 'T unpacked', + 'unpacked = '+ operation, + 'unpackbits_kernel' + ) + + if myarray.dtype != cupy.uint8: + raise TypeError('Expected an input array of unsigned byte data type') + + unpacked = cupy.ndarray((myarray.size * 8), dtype=cupy.uint8) + return _unpackbits_kernel(myarray, unpacked) + + def _get_validity_buffer(self) -> Tuple[_CuDFBuffer, Any]: """ - Return the buffer containing the mask values indicating missing data. + Return the buffer containing the mask values indicating missing data and + the buffer's associated dtype. Raises RuntimeError if null representation is not a bit or byte mask. """ - null, value = self.describe_null - buffer = None - if null == 0: - msg = "This column is non-nullable so does not have a mask" - elif null == 1: - msg = "This column uses NaN as null so does not have a separate mask" - - elif null == 3: - + + null, invalid = self.describe_null + if null == 3: _k = _DtypeKind - if self.dtype[0] in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL): - buffer = _CuDFBuffer(cp.array(self._col.nullmask, copy=False)) - elif self.dtype[0] == _k.CATEGORICAL: - codes = self._col.cat.codes.nullmask - # handling null/NaN - buffer = _CuDFBuffer(cp.array(codes, copy=False)) + bitmask = unpackbits(cp.array(self._col._column.mask, copy=False), bitorder='little')[:len(self._col)] + buffer = _CuDFBuffer(bitmask) + dtype = (_k.UINT, 8, "C", "=") + return buffer, dtype + elif null == 1: + msg = "This column uses NaN as null so does not have a separate mask" + elif null == 0: + msg = "This column is non-nullable so does not have a mask" else: - raise NotImplementedError('See self.describe_null') + raise NotImplementedError("See self.describe_null") - return buffer + raise RuntimeError(msg) + def _get_offsets_buffer(self) -> Tuple[_CuDFBuffer, Any]: + """ + Return the buffer containing the offset values for variable-size binary + data (e.g., variable-length strings) and the buffer's associated dtype. - # def get_children(self) -> Iterable[Column]: - # """ - # Children columns underneath the column, each object in this iterator - # must adhere to the column specification - # """ - # pass + Raises RuntimeError if the data buffer does not have an associated + offsets buffer. + """ + _k = _DtypeKind + if self.dtype[0] == _k.STRING: + # For each string, we need to manually determine the next offset + values = self._col.to_numpy() + ptr = 0 + offsets = [ptr] + for v in values: + # For missing values (in this case, `np.nan` values), we don't increment the pointer) + if type(v) == str: + b = v.encode(encoding="utf-8") + ptr += len(b) + + offsets.append(ptr) + + # Convert the list of offsets to a NumPy array of signed 64-bit integers (note: Arrow allows the offsets array to be either `int32` or `int64`; here, we default to the latter) + buf = cp.asarray(offsets, dtype="int64") + + # Convert the offsets to a Pandas "buffer" using the NumPy array as the backing store + buffer = _CuDFBuffer(buf) + + # Assemble the buffer dtype info + dtype = (_k.INT, 64, 'l', "=") # note: currently only support native endianness + else: + raise RuntimeError("This column has a fixed-length dtype so does not have an offsets buffer") + + return buffer, dtype class _CuDFDataFrame: """ @@ -624,20 +660,25 @@ class _CuDFDataFrame: ``cudf.DataFrame.__dataframe__`` as objects with the methods and attributes defined on this class. """ - def __init__(self, df, nan_as_null : bool = False) -> None: + def __init__(self, df, nan_as_null : bool = True, + allow_copy : bool = True) -> None: """ - , device:str = 'gpu' Constructor - an instance of this (private) class is returned from `cudf.DataFrame.__dataframe__`. """ + self._df = df # ``nan_as_null`` is a keyword intended for the consumer to tell the # producer to overwrite null values in the data with ``NaN`` (or ``NaT``). # This currently has no effect; once support for nullable extension # dtypes is added, this value should be propagated to columns. - # - # ``device`` indicates the target device for the data. self._nan_as_null = nan_as_null - self._df = df + self._allow_copy = allow_copy + + @property + def metadata(self): + # `index` isn't a regular column, and the protocol doesn't support row + # labels - so we export it as Pandas-specific metadata here. + return {"cudf.index": self._df.index} def num_columns(self) -> int: return len(self._df.columns) @@ -652,36 +693,29 @@ def column_names(self) -> Iterable[str]: return self._df.columns.tolist() def get_column(self, i: int) -> _CuDFColumn: - return _CuDFColumn(self._df.iloc[:, i], self._nan_as_null) + return _CuDFColumn( + self._df.iloc[:, i], allow_copy=self._allow_copy) def get_column_by_name(self, name: str) -> _CuDFColumn: - return _CuDFColumn(self._df[name], self._nan_as_null) + return _CuDFColumn( + self._df[name], allow_copy=self._allow_copy) def get_columns(self) -> Iterable[_CuDFColumn]: - return [_CuDFColumn(self._df[name], self._nan_as_null) for name in self._df.columns] + return [_CuDFColumn(self._df[name], allow_copy=self._allow_copy) + for name in self._df.columns] def select_columns(self, indices: Sequence[int]) -> '_CuDFDataFrame': if not isinstance(indices, collections.Sequence): raise ValueError("`indices` is not a sequence") return _CuDFDataFrame(self._df.iloc[:, indices]) - - def select_columns_by_name(self, names: Sequence[str]) -> '_CuDFDataFrame': - """ - Create a new DataFrame by selecting a subset of columns by name. - - Don't use pandas.DataFrame `xs` method as : - def xs(self, key, axis=0, level=None, drop_level: bool_t = True): - - Return cross-section from the Series/DataFrame. - This method takes a `key` argument to select data at a particular - level of a MultiIndex. - """ + def select_columns_by_name(self, names: Sequence[str]) -> '_CuDFDataFrame': if not isinstance(names, collections.Sequence): raise ValueError("`names` is not a sequence") - return _CuDFDataFrame(self._df.loc[:, names], self._nan_as_null) + return _CuDFDataFrame(self._df.loc[:, names], self._nan_as_null, + self.allow_copy) def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable['_CuDFDataFrame']: """ From 8450d7edfd8d2217341de35ee11129327c8633ff Mon Sep 17 00:00:00 2001 From: iskode Date: Mon, 13 Sep 2021 17:56:07 +0000 Subject: [PATCH 08/60] missing values support for int, float and categorical --- python/cudf/cudf/core/df_protocol.py | 84 +++++++++++----------- python/cudf/cudf/tests/test_df_protocol.py | 81 +++++++++++++++++---- 2 files changed, 108 insertions(+), 57 deletions(-) diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index ba5291fb08d..b234cedc1b1 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -26,6 +26,7 @@ import cudf import numpy as np import cupy as cp +from cupy import _core import pandas._testing as tm import cudf.testing as testcase import pytest @@ -37,7 +38,7 @@ ColumnObject = Any -def from_dataframe(df : DataFrameObject, copy: bool = False) : +def from_dataframe(df : DataFrameObject, allow_copy: bool = False) : """ Construct a cudf DataFrame from ``df`` if it supports ``__dataframe__`` """ @@ -47,14 +48,12 @@ def from_dataframe(df : DataFrameObject, copy: bool = False) : if not hasattr(df, '__dataframe__'): raise ValueError("`df` does not support __dataframe__") - return _from_dataframe(df.__dataframe__(), copy=copy) + return _from_dataframe(df.__dataframe__(allow_copy=allow_copy)) def _from_dataframe(df : DataFrameObject, copy: bool = False) : """ - Note: not all cases are handled yet, only ones that can be implemented with - only Pandas. Later, we need to implement/test support for categoricals, - bit/byte masks, chunk handling, etc. + Create a cudf DataFrame object from DataFrameObject Interface. """ # Check number of chunks, if there's more than one we need to iterate if df.num_chunks() > 1: @@ -64,18 +63,22 @@ def _from_dataframe(df : DataFrameObject, copy: bool = False) : # least for now, deal with non-numpy dtypes later). columns = dict() _k = _DtypeKind + _buffers = [] # hold on to buffers, keeps memory alive for name in df.column_names(): col = df.get_column_by_name(name) if col.dtype[0] in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL): # Simple numerical or bool dtype, turn into numpy array - columns[name] = convert_column_to_cupy_ndarray(col, copy=copy) + columns[name], _buf = convert_column_to_cupy_ndarray(col, copy=copy) elif col.dtype[0] == _k.CATEGORICAL: - columns[name] = convert_categorical_column(col, copy=copy) - names = df.column_names() + columns[name], _buf = convert_categorical_column(col, copy=copy) else: raise NotImplementedError(f"Data type {col.dtype[0]} not handled yet") - - return cudf.DataFrame(columns) + + _buffers.append(_buf) + + df_new = cudf.DataFrame(columns) + df_new._buffers = _buffers + return df_new @@ -88,6 +91,16 @@ class _DtypeKind(enum.IntEnum): DATETIME = 22 CATEGORICAL = 23 +def set_missing_values(col, col_array): + series = cudf.Series(col_array) + null_kind, null_value = col.describe_null + if null_kind != 0: + assert null_kind == 3, f"cudf supports only bit mask, null_kind should be 3 ." + _mask_buffer, _mask_dtype = col.get_buffers()["validity"] + bitmask = buffer_to_cupy_ndarray(_mask_buffer, _mask_dtype) + series[bitmask==null_value] = None + + return series def convert_column_to_cupy_ndarray(col : ColumnObject, copy : bool = False) -> np.ndarray: """ @@ -96,15 +109,20 @@ def convert_column_to_cupy_ndarray(col : ColumnObject, copy : bool = False) -> n if col.offset != 0: raise NotImplementedError("column.offset > 0 not handled yet") - # if col.describe_null[0] not in (0, 1): - # raise NotImplementedError("Null values represented as masks or " - # "sentinel values not handled yet") - _buffer, _dtype = col.get_buffers()['data'] - _mask_buffer = col.get_buffers()['validity'] - return buffer_to_cupy_ndarray(_buffer, _dtype, _mask_buffer, copy=copy) + if _buffer.__dlpack_device__()[0] == 2: # dataframe is on GPU/CUDA + x = cp.fromDlpack(_buffer.__dlpack__()) + + elif copy == False: + raise TypeError("This operation must copy data from CPU to GPU. Set `copy=True` to allow it.") -def buffer_to_cupy_ndarray(_buffer, _dtype, _mask_buffer = None, copy : bool = False) -> cp.ndarray: + else: + x = _copy_buffer_to_gpu(_buffer, _dtype) + + return set_missing_values(col, x), _buffer + + +def buffer_to_cupy_ndarray(_buffer, _dtype, copy : bool = False) -> cp.ndarray: if _buffer.__dlpack_device__()[0] == 2: # dataframe is on GPU/CUDA x = cp.fromDlpack(_buffer.__dlpack__()) @@ -148,8 +166,6 @@ def convert_categorical_column(col : ColumnObject, copy:bool=False) : """ Convert a categorical column to a Series instance """ - - ordered, is_dict, mapping = col.describe_categorical if not is_dict: raise NotImplementedError('Non-dictionary categoricals not supported yet') @@ -159,28 +175,13 @@ def convert_categorical_column(col : ColumnObject, copy:bool=False) : # codes = col._col.values.codes categories = cp.asarray(list(mapping.values())) codes_buffer, codes_dtype = col.get_buffers()['data'] - _mask_buffer = col.get_buffers()['validity'] - codes = buffer_to_cupy_ndarray(codes_buffer, codes_dtype, _mask_buffer, copy=copy) + codes = buffer_to_cupy_ndarray(codes_buffer, codes_dtype, copy=copy) values = categories[codes] - # Seems like Pandas can only construct with non-null values, so need to + # Seems like cudf can only construct with non-null values, so need to # null out the nulls later cat = cudf.CategoricalIndex(values, categories=categories, ordered=ordered) - series = cudf.Series(cat) - - null_kind = col.describe_null[0] - if null_kind != 0: - print(null_kind) - if null_kind == 2: # sentinel value - sentinel = col.describe_null[1] - series[codes == sentinel] = None - elif null_kind == 3: - pass - else: - raise NotImplementedError("Only categorical columns with sentinel " - "value supported at the moment") - - return series + return set_missing_values(col, cat), codes_buffer def __dataframe__(self, nan_as_null : bool = False) -> dict: @@ -243,7 +244,6 @@ def bufsize(self) -> int: Buffer size in bytes. """ return self._x.data.mem.size - # return self._x.size * self._x.dtype.itemsize @property def ptr(self) -> int: @@ -572,7 +572,7 @@ def _get_data_buffer(self) -> Tuple[_CuDFBuffer, Any]: # Any is for self.dtype return buffer, dtype - def unpackbits(myarray, bitorder="big"): + def _unpackbits(self, myarray, bitorder="big"): bitorder_op = {"big": '(myarray[i / 8] >> (7 - i % 8)) & 1;', "little": '(myarray[i / 8] >> (i % 8)) & 1;'} @@ -585,10 +585,10 @@ def unpackbits(myarray, bitorder="big"): 'unpackbits_kernel' ) - if myarray.dtype != cupy.uint8: + if myarray.dtype != cp.uint8: raise TypeError('Expected an input array of unsigned byte data type') - unpacked = cupy.ndarray((myarray.size * 8), dtype=cupy.uint8) + unpacked = cp.ndarray((myarray.size * 8), dtype=cp.uint8) return _unpackbits_kernel(myarray, unpacked) def _get_validity_buffer(self) -> Tuple[_CuDFBuffer, Any]: @@ -602,7 +602,7 @@ def _get_validity_buffer(self) -> Tuple[_CuDFBuffer, Any]: null, invalid = self.describe_null if null == 3: _k = _DtypeKind - bitmask = unpackbits(cp.array(self._col._column.mask, copy=False), bitorder='little')[:len(self._col)] + bitmask = self._unpackbits(cp.array(self._col._column.mask, copy=False), bitorder='little')[:len(self._col)] buffer = _CuDFBuffer(bitmask) dtype = (_k.UINT, 8, "C", "=") return buffer, dtype diff --git a/python/cudf/cudf/tests/test_df_protocol.py b/python/cudf/cudf/tests/test_df_protocol.py index 99917b0a2a8..d8c5c2a1d49 100644 --- a/python/cudf/cudf/tests/test_df_protocol.py +++ b/python/cudf/cudf/tests/test_df_protocol.py @@ -6,7 +6,9 @@ _from_dataframe, _DtypeKind, __dataframe__, - _CuDFDataFrame + _CuDFDataFrame, + _CuDFColumn, + _CuDFBuffer ) import cudf @@ -21,11 +23,44 @@ gen_rand, ) import pandas as pd +from typing import Any, Tuple + +DataFrameObject = Any + +def assert_buffer_equal(buffer_dtype: Tuple[_CuDFBuffer, Any], cudfcol:cudf.Series): + buf, dtype = buffer_dtype + assert buf.__dlpack_device__() == (2, 0) + +def assert_column_equal(col: _CuDFColumn, cudfcol:cudf.Series): + assert col.size == cudfcol.size + assert col.offset == 0 + assert col.null_count == cudfcol.isna().sum() + assert col.num_chunks() == 1 + if col.null_count == 0 : + pytest.raises(RuntimeError, col._get_validity_buffer) + assert_buffer_equal(col._get_data_buffer(), cudfcol) + null_kind, null_value = col.describe_null + if col.null_count == 0: + assert null_kind == 0 + assert null_value == None + else: + assert null_kind == 3 + assert null_value == 0 + + +def assert_dataframe_equal(dfo: DataFrameObject, df:cudf.DataFrame): + assert dfo.num_columns() == len(df.columns) + assert dfo.num_rows() == len(df) + assert dfo.num_chunks() == 1 + assert dfo.column_names() == list(df.columns) + for col in df.columns: + assert_column_equal(dfo.get_column_by_name(col), df[col]) def _test_from_dataframe_equals(dfobj, copy=False): df2 = _from_dataframe(dfobj, copy=copy) + assert_dataframe_equal(dfobj, df2) if isinstance(dfobj._df, cudf.DataFrame): assert_eq(dfobj._df, df2) @@ -68,28 +103,44 @@ def test_mixed_intfloat_dtype(): _test_datatype(data_intfloat) def test_categorical_dtype(): - - def test__dataframe__(df): - # Some detailed testing for correctness of dtype: - col = df.__dataframe__().get_column_by_name('A') - assert col.dtype[0] == _DtypeKind.CATEGORICAL - assert col.null_count == 0 - assert col.num_chunks() == 1 - assert col.describe_categorical == (False, True, {0: 1, 1: 2, 2: 5}) - cdf = cudf.DataFrame({"A": [1, 2, 5, 1]}) cdf["A"] = cdf["A"].astype("category") - test__dataframe__(cdf) + col = cdf.__dataframe__().get_column_by_name('A') + assert col.dtype[0] == _DtypeKind.CATEGORICAL + assert col.describe_categorical == (False, True, {0: 1, 1: 2, 2: 5}) _test_from_dataframe_equals(cdf.__dataframe__(), copy=False) _test_from_dataframe_equals(cdf.__dataframe__(), copy=True) def test_NA_int_dtype(): - data_int = dict(a=[1, None, 3], b=[9, 10, None]) + data_int = dict(a=[1, None, 3, None, 5], + b=[9, 10, None, 7, 8], + c= [6, 19, 20, 100, 1000] ) _test_datatype(data_int) -# def test_NA2_int_dtype(): -# data_int = dict(a=[1, None, 3, None, 5], b=[9, 10, None, 7, 8]) -# _test_datatype(data_int) +def test_NA_float_dtype(): + data_float = dict(a=[1.4, None, 3.6, None, 5.2], + b=[9.7, 10.9, None, 7.8, 8.2], + c= [6.1, 19.2, 20.3, 100.4, 1000.5] ) + _test_datatype(data_float) + +def test_NA_categorical_dtype(): + df = cudf.DataFrame({"A": [1, 2, 5, 1]}) + df["B"] = df["A"].astype("category") + df.at[[1, 3], 'B'] = None # Set two items to null + + # Some detailed testing for correctness of dtype and null handling: + col = df.__dataframe__().get_column_by_name('B') + assert col.dtype[0] == _DtypeKind.CATEGORICAL + assert col.null_count == 2 + assert col.describe_null == (3, 0) # sentinel value -1 + assert col.num_chunks() == 1 + assert col.describe_categorical == (False, True, {0: 1, 1: 2, 2: 5}) + _test_from_dataframe_equals(df.__dataframe__(), copy=True) + _test_from_dataframe_equals(df.__dataframe__(), copy=False) + + # df2 = _from_dataframe(df.__dataframe__()) + # assert_dataframe_equal(df.__dataframe__(), df) + # tm.assert_frame_equal(df, df2) # def test_bool_dtype(): From ec842d62ab4e553eb5b12a7f9ace215bb5c058ef Mon Sep 17 00:00:00 2001 From: iskode Date: Tue, 14 Sep 2021 08:40:35 +0000 Subject: [PATCH 09/60] add boolean support w/ missing values --- python/cudf/cudf/core/df_protocol.py | 40 +++++++++++++++++----- python/cudf/cudf/tests/test_df_protocol.py | 10 ++++-- 2 files changed, 38 insertions(+), 12 deletions(-) diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index b234cedc1b1..ec6b9212fd5 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -110,21 +110,37 @@ def convert_column_to_cupy_ndarray(col : ColumnObject, copy : bool = False) -> n raise NotImplementedError("column.offset > 0 not handled yet") _buffer, _dtype = col.get_buffers()['data'] - if _buffer.__dlpack_device__()[0] == 2: # dataframe is on GPU/CUDA - x = cp.fromDlpack(_buffer.__dlpack__()) + x = buffer_to_cupy_ndarray(_buffer, _dtype, copy=copy) + # if _buffer.__dlpack_device__()[0] == 2: # dataframe is on GPU/CUDA + # _k = _DtypeKind + # print(f'buffer dtype: {_dtype[0]}') + # if _dtype[0] in (_k.INT, _k.UINT, _k.FLOAT, _k.CATEGORICAL, _k.BOOL): + # x = cp.fromDlpack(_buffer.__dlpack__()) + # if _dtype[0] == _k.BOOL: + # print(f'before booleanizing: {x}') + # x = x.astype(cp.bool_) + # print(f'after booleanizing: {x}') - elif copy == False: - raise TypeError("This operation must copy data from CPU to GPU. Set `copy=True` to allow it.") - else: - x = _copy_buffer_to_gpu(_buffer, _dtype) + # elif copy == False: + # raise TypeError("This operation must copy data from CPU to GPU. Set `copy=True` to allow it.") + + # else: + # x = _copy_buffer_to_gpu(_buffer, _dtype) return set_missing_values(col, x), _buffer def buffer_to_cupy_ndarray(_buffer, _dtype, copy : bool = False) -> cp.ndarray: if _buffer.__dlpack_device__()[0] == 2: # dataframe is on GPU/CUDA - x = cp.fromDlpack(_buffer.__dlpack__()) + _k = _DtypeKind + print(f'buffer dtype: {_dtype[0]}') + if _dtype[0] in (_k.INT, _k.UINT, _k.FLOAT, _k.CATEGORICAL): + x = cp.fromDlpack(_buffer.__dlpack__()) + elif _dtype[0] == _k.BOOL: + x = cp.fromDlpack(_buffer.__dlpack__()).astype(cp.bool_) + else: + raise TypeError(f"dtype {_dtype[0]} not supported yet !") elif copy == False: raise TypeError("This operation must copy data from CPU to GPU. Set `copy=True` to allow it.") @@ -464,7 +480,7 @@ def describe_null(self) -> Tuple[int, Any]: kind = self.dtype[0] # bit mask is universally used in cudf for missing if kind in (_k.INT, _k.UINT, _k.FLOAT, _k.CATEGORICAL, - _k.STRING, _k.DATETIME): + _k.BOOL, _k.STRING, _k.DATETIME): null = 3 value = 0 else: @@ -541,11 +557,17 @@ def _get_data_buffer(self) -> Tuple[_CuDFBuffer, Any]: # Any is for self.dtype """ _k = _DtypeKind invalid = self.describe_null[1] - if self.dtype[0] in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL): + if self.dtype[0] in (_k.INT, _k.UINT, _k.FLOAT): buffer = _CuDFBuffer( cp.array(self._col.fillna(invalid).to_gpu_array(), copy=False), allow_copy=self._allow_copy) dtype = self.dtype + elif self.dtype[0] == _k.BOOL: + # convert bool to uint8 as dlpack does not support bool natively. + buffer = _CuDFBuffer( + cp.array(self._col.fillna(invalid).to_gpu_array(), dtype=cp.uint8, copy=False), + allow_copy=self._allow_copy) + dtype = self.dtype elif self.dtype[0] == _k.CATEGORICAL: codes = self._col.cat.codes buffer = _CuDFBuffer( diff --git a/python/cudf/cudf/tests/test_df_protocol.py b/python/cudf/cudf/tests/test_df_protocol.py index d8c5c2a1d49..f214807d75b 100644 --- a/python/cudf/cudf/tests/test_df_protocol.py +++ b/python/cudf/cudf/tests/test_df_protocol.py @@ -143,6 +143,10 @@ def test_NA_categorical_dtype(): # tm.assert_frame_equal(df, df2) -# def test_bool_dtype(): -# data_bool = dict(a=[True, True, False], b=[False, True, False]) -# _datatype(data_bool) \ No newline at end of file +def test_bool_dtype(): + data_bool = dict(a=[True, True, False], b=[False, True, False]) + _test_datatype(data_bool) + +def test_NA_bool_dtype(): + data_bool = dict(a=[None, True, False], b=[False, None, None]) + _test_datatype(data_bool) \ No newline at end of file From 13e0b95c6b593e24222629d8f12885519f82938a Mon Sep 17 00:00:00 2001 From: iskode Date: Tue, 14 Sep 2021 09:09:59 +0000 Subject: [PATCH 10/60] refactor 'convert_column_to_cupy_ndarray' and replace 'cp.array' to 'cp.asarray' to enforce zero-copy --- python/cudf/cudf/core/df_protocol.py | 107 +++++++++------------ python/cudf/cudf/tests/test_df_protocol.py | 5 - 2 files changed, 44 insertions(+), 68 deletions(-) diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index ec6b9212fd5..9aeb7188c9a 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -91,16 +91,6 @@ class _DtypeKind(enum.IntEnum): DATETIME = 22 CATEGORICAL = 23 -def set_missing_values(col, col_array): - series = cudf.Series(col_array) - null_kind, null_value = col.describe_null - if null_kind != 0: - assert null_kind == 3, f"cudf supports only bit mask, null_kind should be 3 ." - _mask_buffer, _mask_dtype = col.get_buffers()["validity"] - bitmask = buffer_to_cupy_ndarray(_mask_buffer, _mask_dtype) - series[bitmask==null_value] = None - - return series def convert_column_to_cupy_ndarray(col : ColumnObject, copy : bool = False) -> np.ndarray: """ @@ -111,47 +101,43 @@ def convert_column_to_cupy_ndarray(col : ColumnObject, copy : bool = False) -> n _buffer, _dtype = col.get_buffers()['data'] x = buffer_to_cupy_ndarray(_buffer, _dtype, copy=copy) - # if _buffer.__dlpack_device__()[0] == 2: # dataframe is on GPU/CUDA - # _k = _DtypeKind - # print(f'buffer dtype: {_dtype[0]}') - # if _dtype[0] in (_k.INT, _k.UINT, _k.FLOAT, _k.CATEGORICAL, _k.BOOL): - # x = cp.fromDlpack(_buffer.__dlpack__()) - # if _dtype[0] == _k.BOOL: - # print(f'before booleanizing: {x}') - # x = x.astype(cp.bool_) - # print(f'after booleanizing: {x}') - - - # elif copy == False: - # raise TypeError("This operation must copy data from CPU to GPU. Set `copy=True` to allow it.") - - # else: - # x = _copy_buffer_to_gpu(_buffer, _dtype) return set_missing_values(col, x), _buffer def buffer_to_cupy_ndarray(_buffer, _dtype, copy : bool = False) -> cp.ndarray: if _buffer.__dlpack_device__()[0] == 2: # dataframe is on GPU/CUDA - _k = _DtypeKind - print(f'buffer dtype: {_dtype[0]}') - if _dtype[0] in (_k.INT, _k.UINT, _k.FLOAT, _k.CATEGORICAL): - x = cp.fromDlpack(_buffer.__dlpack__()) - elif _dtype[0] == _k.BOOL: - x = cp.fromDlpack(_buffer.__dlpack__()).astype(cp.bool_) - else: - raise TypeError(f"dtype {_dtype[0]} not supported yet !") - - elif copy == False: - raise TypeError("This operation must copy data from CPU to GPU. Set `copy=True` to allow it.") - + x = _gpu_buffer_to_cupy(_buffer, _dtype) else: - x = _copy_buffer_to_gpu(_buffer, _dtype) + if not copy: + raise TypeError("This operation must copy data from CPU to GPU. Set `copy=True` to allow it.") + x = _cpu_buffer_to_cupy(_buffer, _dtype) return x +def set_missing_values(col, col_array): + series = cudf.Series(col_array) + null_kind, null_value = col.describe_null + if null_kind != 0: + assert null_kind == 3, f"cudf supports only bit mask, null_kind should be 3 ." + _mask_buffer, _mask_dtype = col.get_buffers()["validity"] + bitmask = buffer_to_cupy_ndarray(_mask_buffer, _mask_dtype) + series[bitmask==null_value] = None + + return series -def _copy_buffer_to_gpu(_buffer, _dtype): +def _gpu_buffer_to_cupy(_buffer, _dtype): + _k = _DtypeKind + print(f'buffer dtype: {_dtype[0]}') + if _dtype[0] in (_k.INT, _k.UINT, _k.FLOAT, _k.CATEGORICAL): + x = cp.fromDlpack(_buffer.__dlpack__()) + elif _dtype[0] == _k.BOOL: + x = cp.fromDlpack(_buffer.__dlpack__()).astype(cp.bool_) + else: + raise NotImplementedError(f"Data type {_dtype[0]} not handled yet") + return x + +def _cpu_buffer_to_cupy(_buffer, _dtype): # Handle the dtype kind = _dtype[0] bitwidth = _dtype[1] @@ -175,7 +161,7 @@ def _copy_buffer_to_gpu(_buffer, _dtype): # buffer! (not done yet, this is pretty awful ...) x = np.ctypeslib.as_array(data_pointer, shape=(_buffer.bufsize // (bitwidth//8),)) - return cp.array(x, dtype=column_dtype) + return cp.asarray(x, dtype=column_dtype) def convert_categorical_column(col : ColumnObject, copy:bool=False) : @@ -200,12 +186,12 @@ def convert_categorical_column(col : ColumnObject, copy:bool=False) : return set_missing_values(col, cat), codes_buffer -def __dataframe__(self, nan_as_null : bool = False) -> dict: +def __dataframe__(self, nan_as_null : bool = False, + allow_copy : bool = True) -> dict: """ - , target_device:str = 'gpu' - The public method to attach to cudf.DataFrame + The public method to attach to cudf.DataFrame. - We'll attach it via monkeypatching here for demo purposes. If Pandas adopt + We'll attach it via monkey-patching here for demo purposes. If Pandas adopts the protocol, this will be a regular method on pandas.DataFrame. ``nan_as_null`` is a keyword intended for the consumer to tell the @@ -213,20 +199,15 @@ def __dataframe__(self, nan_as_null : bool = False) -> dict: This currently has no effect; once support for nullable extension dtypes is added, this value should be propagated to columns. - ``target_device`` specifies the device where the returned dataframe protocol - object will live. Only `cpu` and `gpu` are supported for now. + ``allow_copy`` is a keyword that defines whether or not the library is + allowed to make a copy of the data. For example, copying data would be + necessary if a library supports strided buffers, given that this protocol + specifies contiguous buffers. + Currently, if the flag is set to ``False`` and a copy is needed, a + ``RuntimeError`` will be raised. """ - # if target_device not in ['cpu', 'gpu']: - # raise TypeError (f'Device {device} support not handle.') - - # if device == 'cpu': - # raise TypeError("This operation will copy data from GPU to CPU. Set `copy=True` to allow it.") - - - return _CuDFDataFrame(self, nan_as_null=nan_as_null) - -# Monkeypatch the Pandas DataFrame class to support the interchange protocol -# cudf.DataFrame.__dataframe__ = __dataframe__ + return _CuDFDataFrame( + self, nan_as_null=nan_as_null, allow_copy=allow_copy) # Implementation of interchange protocol @@ -401,7 +382,7 @@ def _dtype_from_cudfdtype(self, dtype) -> Tuple[enum.IntEnum, int, str, str]: "M": _k.DATETIME, "m": _k.DATETIME} kind = _np_kinds.get(dtype.kind, None) if kind is None: - # Not a NumPy dtype. Check if it's a categorical maybe + # Not a NumPy/CuPy dtype. Check if it's a categorical maybe if isinstance(dtype, cudf.CategoricalDtype): kind = 23 # Codes and categorical values dtypes are different. @@ -559,19 +540,19 @@ def _get_data_buffer(self) -> Tuple[_CuDFBuffer, Any]: # Any is for self.dtype invalid = self.describe_null[1] if self.dtype[0] in (_k.INT, _k.UINT, _k.FLOAT): buffer = _CuDFBuffer( - cp.array(self._col.fillna(invalid).to_gpu_array(), copy=False), + cp.asarray(self._col.fillna(invalid).to_gpu_array()), allow_copy=self._allow_copy) dtype = self.dtype elif self.dtype[0] == _k.BOOL: # convert bool to uint8 as dlpack does not support bool natively. buffer = _CuDFBuffer( - cp.array(self._col.fillna(invalid).to_gpu_array(), dtype=cp.uint8, copy=False), + cp.asarray(self._col.fillna(invalid).to_gpu_array(), dtype=cp.uint8), allow_copy=self._allow_copy) dtype = self.dtype elif self.dtype[0] == _k.CATEGORICAL: codes = self._col.cat.codes buffer = _CuDFBuffer( - cp.array(codes.fillna(invalid), copy=False), + cp.asarray(codes.fillna(invalid)), allow_copy=self._allow_copy) dtype = self._dtype_from_cudfdtype(codes.dtype) # elif self.dtype[0] == _k.STRING: @@ -624,7 +605,7 @@ def _get_validity_buffer(self) -> Tuple[_CuDFBuffer, Any]: null, invalid = self.describe_null if null == 3: _k = _DtypeKind - bitmask = self._unpackbits(cp.array(self._col._column.mask, copy=False), bitorder='little')[:len(self._col)] + bitmask = self._unpackbits(cp.asarray(self._col._column.mask), bitorder='little')[:len(self._col)] buffer = _CuDFBuffer(bitmask) dtype = (_k.UINT, 8, "C", "=") return buffer, dtype diff --git a/python/cudf/cudf/tests/test_df_protocol.py b/python/cudf/cudf/tests/test_df_protocol.py index f214807d75b..b9eae721353 100644 --- a/python/cudf/cudf/tests/test_df_protocol.py +++ b/python/cudf/cudf/tests/test_df_protocol.py @@ -138,11 +138,6 @@ def test_NA_categorical_dtype(): _test_from_dataframe_equals(df.__dataframe__(), copy=True) _test_from_dataframe_equals(df.__dataframe__(), copy=False) - # df2 = _from_dataframe(df.__dataframe__()) - # assert_dataframe_equal(df.__dataframe__(), df) - # tm.assert_frame_equal(df, df2) - - def test_bool_dtype(): data_bool = dict(a=[True, True, False], b=[False, True, False]) _test_datatype(data_bool) From dfa02a2e9a40fbc032171a0410147868a0b79b6f Mon Sep 17 00:00:00 2001 From: iskode Date: Tue, 14 Sep 2021 14:54:16 +0000 Subject: [PATCH 11/60] change 'copy' to 'allow_copy' code wide --- python/cudf/cudf/core/dataframe.py | 10 +++-- python/cudf/cudf/core/df_protocol.py | 24 ++++++----- python/cudf/cudf/tests/test_df_protocol.py | 49 +++++++++++++--------- 3 files changed, 48 insertions(+), 35 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 38b2141b987..fca86e788a0 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -7412,11 +7412,13 @@ def explode(self, column, ignore_index=False): return super()._explode(column, ignore_index) - def __dataframe__(self, nan_as_null : bool = False): - return df_protocol.__dataframe__(self, nan_as_null=nan_as_null) + def __dataframe__(self, nan_as_null : bool = False, + allow_copy : bool = True): + return df_protocol.__dataframe__(self, nan_as_null=nan_as_null, + allow_copy=allow_copy) -def from_dataframe(df, copy = False): - return df_protocol.from_dataframe(df, copy=copy) +def from_dataframe(df, allow_copy = False): + return df_protocol.from_dataframe(df, allow_copy=allow_copy) def from_pandas(obj, nan_as_null=None): """ diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index 9aeb7188c9a..3a5b70e72df 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -51,9 +51,9 @@ def from_dataframe(df : DataFrameObject, allow_copy: bool = False) : return _from_dataframe(df.__dataframe__(allow_copy=allow_copy)) -def _from_dataframe(df : DataFrameObject, copy: bool = False) : +def _from_dataframe(df : DataFrameObject) : """ - Create a cudf DataFrame object from DataFrameObject Interface. + Create a cudf DataFrame object from DataFrameObject. """ # Check number of chunks, if there's more than one we need to iterate if df.num_chunks() > 1: @@ -68,9 +68,9 @@ def _from_dataframe(df : DataFrameObject, copy: bool = False) : col = df.get_column_by_name(name) if col.dtype[0] in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL): # Simple numerical or bool dtype, turn into numpy array - columns[name], _buf = convert_column_to_cupy_ndarray(col, copy=copy) + columns[name], _buf = convert_column_to_cupy_ndarray(col, allow_copy=col._allow_copy) elif col.dtype[0] == _k.CATEGORICAL: - columns[name], _buf = convert_categorical_column(col, copy=copy) + columns[name], _buf = convert_categorical_column(col, allow_copy=col._allow_copy) else: raise NotImplementedError(f"Data type {col.dtype[0]} not handled yet") @@ -92,7 +92,7 @@ class _DtypeKind(enum.IntEnum): CATEGORICAL = 23 -def convert_column_to_cupy_ndarray(col : ColumnObject, copy : bool = False) -> np.ndarray: +def convert_column_to_cupy_ndarray(col:ColumnObject, allow_copy:bool = False) -> cp.ndarray: """ Convert an int, uint, float or bool column to a numpy array """ @@ -100,17 +100,18 @@ def convert_column_to_cupy_ndarray(col : ColumnObject, copy : bool = False) -> n raise NotImplementedError("column.offset > 0 not handled yet") _buffer, _dtype = col.get_buffers()['data'] - x = buffer_to_cupy_ndarray(_buffer, _dtype, copy=copy) + x = buffer_to_cupy_ndarray(_buffer, _dtype, allow_copy=allow_copy) return set_missing_values(col, x), _buffer -def buffer_to_cupy_ndarray(_buffer, _dtype, copy : bool = False) -> cp.ndarray: +def buffer_to_cupy_ndarray(_buffer, _dtype, allow_copy : bool = False) -> cp.ndarray: if _buffer.__dlpack_device__()[0] == 2: # dataframe is on GPU/CUDA x = _gpu_buffer_to_cupy(_buffer, _dtype) else: - if not copy: - raise TypeError("This operation must copy data from CPU to GPU. Set `copy=True` to allow it.") + if not allow_copy: + raise TypeError("This operation must copy data from CPU to GPU." + "Set `allow_copy=True` to allow it.") x = _cpu_buffer_to_cupy(_buffer, _dtype) return x @@ -164,7 +165,7 @@ def _cpu_buffer_to_cupy(_buffer, _dtype): return cp.asarray(x, dtype=column_dtype) -def convert_categorical_column(col : ColumnObject, copy:bool=False) : +def convert_categorical_column(col : ColumnObject, allow_copy:bool=False) : """ Convert a categorical column to a Series instance """ @@ -177,7 +178,8 @@ def convert_categorical_column(col : ColumnObject, copy:bool=False) : # codes = col._col.values.codes categories = cp.asarray(list(mapping.values())) codes_buffer, codes_dtype = col.get_buffers()['data'] - codes = buffer_to_cupy_ndarray(codes_buffer, codes_dtype, copy=copy) + codes = buffer_to_cupy_ndarray(codes_buffer, codes_dtype, + allow_copy=allow_copy) values = categories[codes] # Seems like cudf can only construct with non-null values, so need to diff --git a/python/cudf/cudf/tests/test_df_protocol.py b/python/cudf/cudf/tests/test_df_protocol.py index b9eae721353..f89ddeeb0e3 100644 --- a/python/cudf/cudf/tests/test_df_protocol.py +++ b/python/cudf/cudf/tests/test_df_protocol.py @@ -38,6 +38,7 @@ def assert_column_equal(col: _CuDFColumn, cudfcol:cudf.Series): assert col.num_chunks() == 1 if col.null_count == 0 : pytest.raises(RuntimeError, col._get_validity_buffer) + assert col.get_buffers()['validity'] == None assert_buffer_equal(col._get_data_buffer(), cudfcol) null_kind, null_value = col.describe_null if col.null_count == 0: @@ -57,8 +58,8 @@ def assert_dataframe_equal(dfo: DataFrameObject, df:cudf.DataFrame): assert_column_equal(dfo.get_column_by_name(col), df[col]) -def _test_from_dataframe_equals(dfobj, copy=False): - df2 = _from_dataframe(dfobj, copy=copy) +def _test_from_dataframe_equals(dfobj): + df2 = _from_dataframe(dfobj) assert_dataframe_equal(dfobj, df2) if isinstance(dfobj._df, cudf.DataFrame): @@ -72,23 +73,26 @@ def _test_from_dataframe_equals(dfobj, copy=False): def _test_from_dataframe_exception(dfobj): - exception_msg = "This operation must copy data from CPU to GPU. Set `copy=True` to allow it." + exception_msg = "This operation must copy data from CPU to GPU. Set `allow_copy=True` to allow it." with pytest.raises(TypeError, match=exception_msg): - df2 = _from_dataframe(dfobj, copy=False) + df2 = _from_dataframe(dfobj) def _test_datatype(data): cdf = cudf.DataFrame(data=data) - cdfobj = cdf.__dataframe__() - print(cdfobj) - _test_from_dataframe_equals(cdfobj, copy=False) - _test_from_dataframe_equals(cdfobj, copy=True) + _test_from_dataframe_equals(cdf.__dataframe__(allow_copy=False)) + _test_from_dataframe_equals(cdf.__dataframe__(allow_copy=True)) # pdf = pd.DataFrame(data=data) # cpu_dfobj = _CuDFDataFrame(pdf) # _test_from_dataframe_exception(cpu_dfobj) - # _test_from_dataframe_equals(cpu_dfobj, copy=True) + # _test_from_dataframe_equals(cpu_dfobj, allow_copy=True) +def test_from_dataframe(): + data = dict(a=[1, 2, 3], b=[9, 10, 11]) + df1 = cudf.DataFrame(data=data) + df2 = cudf.from_dataframe(df1) + assert_eq(df1, df2) def test_int_dtype(): data_int = dict(a=[1, 2, 3], b=[9, 10, 11]) @@ -98,18 +102,24 @@ def test_float_dtype(): data_float = dict(a=[1.5, 2.5, 3.5], b=[9.2, 10.5, 11.8]) _test_datatype(data_float) -def test_mixed_intfloat_dtype(): - data_intfloat = dict(a=[1, 2, 3], b=[1.5, 2.5, 3.5]) - _test_datatype(data_intfloat) - def test_categorical_dtype(): cdf = cudf.DataFrame({"A": [1, 2, 5, 1]}) cdf["A"] = cdf["A"].astype("category") col = cdf.__dataframe__().get_column_by_name('A') assert col.dtype[0] == _DtypeKind.CATEGORICAL assert col.describe_categorical == (False, True, {0: 1, 1: 2, 2: 5}) - _test_from_dataframe_equals(cdf.__dataframe__(), copy=False) - _test_from_dataframe_equals(cdf.__dataframe__(), copy=True) + _test_from_dataframe_equals(cdf.__dataframe__(allow_copy=False)) + _test_from_dataframe_equals(cdf.__dataframe__(allow_copy=True)) + +def test_bool_dtype(): + data_bool = dict(a=[True, True, False], b=[False, True, False]) + _test_datatype(data_bool) + +def test_mixed_dtype(): + data_mixed = dict(int=[1, 2, 3], float=[1.5, 2.5, 3.5], + bool=[True, False, True], categorical=[5, 1, 5]) + _test_datatype(data_mixed) + def test_NA_int_dtype(): data_int = dict(a=[1, None, 3, None, 5], @@ -135,12 +145,11 @@ def test_NA_categorical_dtype(): assert col.describe_null == (3, 0) # sentinel value -1 assert col.num_chunks() == 1 assert col.describe_categorical == (False, True, {0: 1, 1: 2, 2: 5}) - _test_from_dataframe_equals(df.__dataframe__(), copy=True) - _test_from_dataframe_equals(df.__dataframe__(), copy=False) + _test_from_dataframe_equals(df.__dataframe__(allow_copy=False)) + _test_from_dataframe_equals(df.__dataframe__(allow_copy=True)) + + -def test_bool_dtype(): - data_bool = dict(a=[True, True, False], b=[False, True, False]) - _test_datatype(data_bool) def test_NA_bool_dtype(): data_bool = dict(a=[None, True, False], b=[False, None, None]) From d0cd04c5b3f4a72e6239a1f8a57655fcbb006a2a Mon Sep 17 00:00:00 2001 From: iskode Date: Wed, 22 Sep 2021 03:19:59 +0000 Subject: [PATCH 12/60] make "from_dataframe" accessible cudf: cudf.from_dataframe --- python/cudf/cudf/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py index 6b5e5b858f0..1d35682ae82 100644 --- a/python/cudf/cudf/__init__.py +++ b/python/cudf/cudf/__init__.py @@ -42,7 +42,7 @@ UInt64Index, interval_range, ) -from cudf.core.dataframe import DataFrame, from_pandas, merge +from cudf.core.dataframe import DataFrame, from_pandas, merge, from_dataframe from cudf.core.series import Series from cudf.core.multiindex import MultiIndex from cudf.core.cut import cut From 2e85f5d0cca987dc9d539f712bcb1a0989e301bd Mon Sep 17 00:00:00 2001 From: iskode Date: Thu, 30 Sep 2021 01:17:19 +0000 Subject: [PATCH 13/60] minor corrections + remove 'unpackbits' in favor of cudf's own function --- python/cudf/cudf/core/df_protocol.py | 77 +++++++++++----------------- 1 file changed, 29 insertions(+), 48 deletions(-) diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index c6afb505623..3b1023f67e9 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -129,7 +129,6 @@ def set_missing_values(col, col_array): def _gpu_buffer_to_cupy(_buffer, _dtype): _k = _DtypeKind - print(f'buffer dtype: {_dtype[0]}') if _dtype[0] in (_k.INT, _k.UINT, _k.FLOAT, _k.CATEGORICAL): x = cp.fromDlpack(_buffer.__dlpack__()) elif _dtype[0] == _k.BOOL: @@ -242,7 +241,7 @@ def bufsize(self) -> int: """ Buffer size in bytes. """ - return self._x.data.mem.size + return self._x.size * self._x.dtype.itemsize @property def ptr(self) -> int: @@ -577,25 +576,6 @@ def _get_data_buffer(self) -> Tuple[_CuDFBuffer, Any]: # Any is for self.dtype return buffer, dtype - def _unpackbits(self, myarray, bitorder="big"): - - bitorder_op = {"big": '(myarray[i / 8] >> (7 - i % 8)) & 1;', - "little": '(myarray[i / 8] >> (i % 8)) & 1;'} - operation = bitorder_op.get(bitorder, None) - if operation == None: - raise KeyError(f"bitorder must be either 'big' or 'little' not '{bitorder}'") - _unpackbits_kernel = _core.ElementwiseKernel( - 'raw uint8 myarray', 'T unpacked', - 'unpacked = '+ operation, - 'unpackbits_kernel' - ) - - if myarray.dtype != cp.uint8: - raise TypeError('Expected an input array of unsigned byte data type') - - unpacked = cp.ndarray((myarray.size * 8), dtype=cp.uint8) - return _unpackbits_kernel(myarray, unpacked) - def _get_validity_buffer(self) -> Tuple[_CuDFBuffer, Any]: """ Return the buffer containing the mask values indicating missing data and @@ -607,7 +587,7 @@ def _get_validity_buffer(self) -> Tuple[_CuDFBuffer, Any]: null, invalid = self.describe_null if null == 3: _k = _DtypeKind - bitmask = self._unpackbits(cp.asarray(self._col._column.mask), bitorder='little')[:len(self._col)] + bitmask = cp.asarray(self._col._column._get_mask_as_column().to_gpu_array(), dtype=cp.uint8) buffer = _CuDFBuffer(bitmask) dtype = (_k.UINT, 8, "C", "=") return buffer, dtype @@ -629,32 +609,33 @@ def _get_offsets_buffer(self) -> Tuple[_CuDFBuffer, Any]: Raises RuntimeError if the data buffer does not have an associated offsets buffer. """ - _k = _DtypeKind - if self.dtype[0] == _k.STRING: - # For each string, we need to manually determine the next offset - values = self._col.to_numpy() - ptr = 0 - offsets = [ptr] - for v in values: - # For missing values (in this case, `np.nan` values), we don't increment the pointer) - if type(v) == str: - b = v.encode(encoding="utf-8") - ptr += len(b) - - offsets.append(ptr) - - # Convert the list of offsets to a NumPy array of signed 64-bit integers (note: Arrow allows the offsets array to be either `int32` or `int64`; here, we default to the latter) - buf = cp.asarray(offsets, dtype="int64") - - # Convert the offsets to a Pandas "buffer" using the NumPy array as the backing store - buffer = _CuDFBuffer(buf) - - # Assemble the buffer dtype info - dtype = (_k.INT, 64, 'l', "=") # note: currently only support native endianness - else: - raise RuntimeError("This column has a fixed-length dtype so does not have an offsets buffer") + # _k = _DtypeKind + # if self.dtype[0] == _k.STRING: + # # For each string, we need to manually determine the next offset + # values = self._col.to_numpy() + # ptr = 0 + # offsets = [ptr] + # for v in values: + # # For missing values (in this case, `np.nan` values), we don't increment the pointer) + # if type(v) == str: + # b = v.encode(encoding="utf-8") + # ptr += len(b) - return buffer, dtype + # offsets.append(ptr) + + # # Convert the list of offsets to a NumPy array of signed 64-bit integers (note: Arrow allows the offsets array to be either `int32` or `int64`; here, we default to the latter) + # buf = cp.asarray(offsets, dtype="int64") + + # # Convert the offsets to a Pandas "buffer" using the NumPy array as the backing store + # buffer = _CuDFBuffer(buf) + + # # Assemble the buffer dtype info + # dtype = (_k.INT, 64, 'l', "=") # note: currently only support native endianness + # else: + # raise RuntimeError("This column has a fixed-length dtype so does not have an offsets buffer") + + # return buffer, dtype + pass class _CuDFDataFrame: """ @@ -720,7 +701,7 @@ def select_columns_by_name(self, names: Sequence[str]) -> '_CuDFDataFrame': raise ValueError("`names` is not a sequence") return _CuDFDataFrame(self._df.loc[:, names], self._nan_as_null, - self.allow_copy) + self._allow_copy) def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable['_CuDFDataFrame']: """ From 9ed11a3714d4408443a9fcd05d5549fdfe7a6b26 Mon Sep 17 00:00:00 2001 From: iskode Date: Fri, 1 Oct 2021 22:17:58 +0000 Subject: [PATCH 14/60] use cudf Column object as _CuDFColumn's _col attribute instead of cudf Series object --- python/cudf/cudf/core/df_protocol.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index 3b1023f67e9..26dd07c1fd4 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -24,12 +24,9 @@ from typing import Any, Optional, Tuple, Dict, Iterable, Sequence import cudf +from cudf.core.column import as_column import numpy as np import cupy as cp -from cupy import _core -import pandas._testing as tm -import cudf.testing as testcase -import pytest # A typing protocol could be added later to let Mypy validate code using @@ -304,7 +301,7 @@ def __init__(self, column, "yet".format(type(column))) # Store the column as a private attribute - self._col = column + self._col = as_column(column) self._nan_as_null = nan_as_null self._allow_copy = allow_copy @@ -386,9 +383,9 @@ def _dtype_from_cudfdtype(self, dtype) -> Tuple[enum.IntEnum, int, str, str]: # Not a NumPy/CuPy dtype. Check if it's a categorical maybe if isinstance(dtype, cudf.CategoricalDtype): kind = _k.CATEGORICAL - # Codes and categorical values dtypes are different. + # Codes and categories' dtypes are different. # We use codes' dtype as these are stored in the buffer. - dtype = self._col.cat.codes.dtype + dtype = self._col.codes.dtype else: raise ValueError(f"Data type {dtype} not supported by exchange" "protocol") @@ -428,9 +425,9 @@ def describe_categorical(self) -> Tuple[Any, bool, Dict[int, Any]]: is_dictionary = True # NOTE: this shows the children approach is better, transforming # `categories` to a "mapping" dict is inefficient - codes = self._col.cat.codes # ndarray, length `self.size` + codes = self._col.codes # ndarray, length `self.size` # categories.values is ndarray of length n_categories - categories = self._col.cat.categories + categories = self._col.categories mapping = {ix: val for ix, val in enumerate(categories.values_host)} return ordered, is_dictionary, mapping @@ -551,7 +548,7 @@ def _get_data_buffer(self) -> Tuple[_CuDFBuffer, Any]: # Any is for self.dtype allow_copy=self._allow_copy) dtype = self.dtype elif self.dtype[0] == _k.CATEGORICAL: - codes = self._col.cat.codes + codes = self._col.codes buffer = _CuDFBuffer( cp.asarray(codes.fillna(invalid)), allow_copy=self._allow_copy) @@ -587,7 +584,7 @@ def _get_validity_buffer(self) -> Tuple[_CuDFBuffer, Any]: null, invalid = self.describe_null if null == 3: _k = _DtypeKind - bitmask = cp.asarray(self._col._column._get_mask_as_column().to_gpu_array(), dtype=cp.uint8) + bitmask = cp.asarray(self._col._get_mask_as_column().to_gpu_array(), dtype=cp.uint8) buffer = _CuDFBuffer(bitmask) dtype = (_k.UINT, 8, "C", "=") return buffer, dtype From ca9686d12a0fa68a6711f316cd202635e287e89a Mon Sep 17 00:00:00 2001 From: iskode Date: Wed, 6 Oct 2021 06:47:07 +0000 Subject: [PATCH 15/60] use cudf buffer in _CuDFBuffer class --- python/cudf/cudf/core/df_protocol.py | 84 ++++++++++++++++++---------- 1 file changed, 55 insertions(+), 29 deletions(-) diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index 26dd07c1fd4..bc1f905f13f 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -24,7 +24,8 @@ from typing import Any, Optional, Tuple, Dict, Iterable, Sequence import cudf -from cudf.core.column import as_column +from cudf.core.column import as_column, build_column, build_categorical_column +from cudf.core.buffer import Buffer import numpy as np import cupy as cp @@ -96,10 +97,20 @@ def convert_column_to_cupy_ndarray(col:ColumnObject, allow_copy:bool = False) -> if col.offset != 0: raise NotImplementedError("column.offset > 0 not handled yet") - _buffer, _dtype = col.get_buffers()['data'] - x = buffer_to_cupy_ndarray(_buffer, _dtype, allow_copy=allow_copy) + _dbuffer, _ddtype = col.get_buffers()['data'] + dcol = build_column(Buffer(_dbuffer.ptr, _dbuffer.bufsize), protocol_dtype_to_np_dtype(_ddtype)) + null_kind, null_value = col.describe_null + if null_kind != 0: + _vbuffer, _vdtype = col.get_buffers()['validity'] + valid_mask = cp.asarray(Buffer(_vbuffer.ptr, _vbuffer.bufsize), cp.bool8) + dcol[~valid_mask] = None + + return dcol, _dbuffer + # Buffer(_vbuffer.ptr, _vbuffer.bufsize)if _vbuffer != None else None) + # x = buffer_to_cupy_ndarray(_buffer, _dtype, allow_copy=allow_copy) + # x = buffer_to_cupy_ndarray(_buffer, _dtype, allow_copy=allow_copy) - return set_missing_values(col, x), _buffer + # return set_missing_values(col, x), _buffer def buffer_to_cupy_ndarray(_buffer, _dtype, allow_copy : bool = False) -> cp.ndarray: @@ -134,20 +145,25 @@ def _gpu_buffer_to_cupy(_buffer, _dtype): raise NotImplementedError(f"Data type {_dtype[0]} not handled yet") return x -def _cpu_buffer_to_cupy(_buffer, _dtype): - # Handle the dtype +def protocol_dtype_to_np_dtype(_dtype): + print(_dtype) kind = _dtype[0] bitwidth = _dtype[1] _k = _DtypeKind - if _dtype[0] not in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL): - raise RuntimeError("Not a boolean, integer or floating-point dtype") + if _dtype[0] not in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL,_k.CATEGORICAL, + _k.STRING, _k.DATETIME): + raise RuntimeError(f"Data type {_dtype[0]} not handled yet") _ints = {8: np.int8, 16: np.int16, 32: np.int32, 64: np.int64} _uints = {8: np.uint8, 16: np.uint16, 32: np.uint32, 64: np.uint64} _floats = {32: np.float32, 64: np.float64} _np_dtypes = {0: _ints, 1: _uints, 2: _floats, 20: {8: bool}} - column_dtype = _np_dtypes[kind][bitwidth] + return _np_dtypes[kind][bitwidth] +def _cpu_buffer_to_cupy(_buffer, _dtype): + # Handle the dtype + + column_dtype = protocol_dtype_to_np_dtype(_dtype) # No DLPack yet, so need to construct a new ndarray from the data pointer # and size in the buffer plus the dtype on the column ctypes_type = np.ctypeslib.as_ctypes_type(column_dtype) @@ -216,18 +232,18 @@ class _CuDFBuffer: Data in the buffer is guaranteed to be contiguous in memory. """ - def __init__(self, x : cp.ndarray, allow_copy : bool = True) -> None: + def __init__(self, x : Buffer, allow_copy : bool = True) -> None: """ - Handle only regular columns (= numpy arrays) for now. + Use cudf Buffer object. """ - if not x.strides == (x.dtype.itemsize,): - # The protocol does not support strided buffers, so a copy is - # necessary. If that's not allowed, we need to raise an exception. - if allow_copy: - x = x.copy() - else: - raise RuntimeError("Exports cannot be zero-copy in the case " - "of a non-contiguous buffer") + # if not x.strides == (x.dtype.itemsize,): + # # The protocol does not support strided buffers, so a copy is + # # necessary. If that's not allowed, we need to raise an exception. + # if allow_copy: + # x = x.copy() + # else: + # raise RuntimeError("Exports cannot be zero-copy in the case " + # "of a non-contiguous buffer") # Store the numpy array in which the data resides as a private # attribute, so we can use it to retrieve the public attributes @@ -238,21 +254,24 @@ def bufsize(self) -> int: """ Buffer size in bytes. """ - return self._x.size * self._x.dtype.itemsize + return self._x.nbytes + # return self._x.size * self._x.dtype.itemsize @property def ptr(self) -> int: """ Pointer to start of the buffer as an integer. """ - return self._x.__cuda_array_interface__['data'][0] - + return self._x.ptr + # return self._x.__cuda_array_interface__['data'][0] + def __dlpack__(self): """ DLPack not implemented in NumPy yet, so leave it out here. """ try: - res = self._x.toDlpack() + # res = self._x.toDlpack() + res = cp.asarray(self._x).toDlpack() except ValueError: raise TypeError(f'dtype {self._x.dtype} unsupported by `dlpack`') @@ -265,7 +284,7 @@ def __dlpack_device__(self) -> Tuple[enum.IntEnum, int]: class Device(enum.IntEnum): CUDA = 2 - return (Device.CUDA, self._x.device.id) + return (Device.CUDA, cp.asarray(self._x).device.id) def __repr__(self) -> str: return 'CuDFBuffer(' + str({'bufsize': self.bufsize, @@ -538,19 +557,22 @@ def _get_data_buffer(self) -> Tuple[_CuDFBuffer, Any]: # Any is for self.dtype invalid = self.describe_null[1] if self.dtype[0] in (_k.INT, _k.UINT, _k.FLOAT): buffer = _CuDFBuffer( - cp.asarray(self._col.fillna(invalid).to_gpu_array()), + self._col.data, + # cp.asarray(self._col.fillna(invalid).to_gpu_array()), allow_copy=self._allow_copy) dtype = self.dtype elif self.dtype[0] == _k.BOOL: # convert bool to uint8 as dlpack does not support bool natively. buffer = _CuDFBuffer( - cp.asarray(self._col.fillna(invalid).to_gpu_array(), dtype=cp.uint8), + self._col.data, + # cp.asarray(self._col.fillna(invalid).to_gpu_array(), dtype=cp.uint8), allow_copy=self._allow_copy) dtype = self.dtype elif self.dtype[0] == _k.CATEGORICAL: codes = self._col.codes buffer = _CuDFBuffer( - cp.asarray(codes.fillna(invalid)), + self._col.codes.data, + # cp.asarray(codes.fillna(invalid)), allow_copy=self._allow_copy) dtype = self._dtype_from_cudfdtype(codes.dtype) # elif self.dtype[0] == _k.STRING: @@ -584,8 +606,12 @@ def _get_validity_buffer(self) -> Tuple[_CuDFBuffer, Any]: null, invalid = self.describe_null if null == 3: _k = _DtypeKind - bitmask = cp.asarray(self._col._get_mask_as_column().to_gpu_array(), dtype=cp.uint8) - buffer = _CuDFBuffer(bitmask) + # bitmask = cp.asarray(self._col._get_mask_as_column().to_gpu_array(), dtype=cp.uint8) + # buffer = _CuDFBuffer(bitmask) + if self.dtype[0] == _k.CATEGORICAL: + buffer = _CuDFBuffer(self._col.codes._get_mask_as_column().data) + else: + buffer = _CuDFBuffer(self._col._get_mask_as_column().data) dtype = (_k.UINT, 8, "C", "=") return buffer, dtype From a3e65e47906d1f5897741b013f05b5b482b73c9b Mon Sep 17 00:00:00 2001 From: iskode Date: Thu, 7 Oct 2021 10:44:29 +0000 Subject: [PATCH 16/60] use buffer protocol instead of dlpack protocol as the latter doesn't work now --- python/cudf/cudf/core/df_protocol.py | 56 ++++++++-------------------- 1 file changed, 15 insertions(+), 41 deletions(-) diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index bc1f905f13f..4f13f5c7738 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -98,22 +98,11 @@ def convert_column_to_cupy_ndarray(col:ColumnObject, allow_copy:bool = False) -> raise NotImplementedError("column.offset > 0 not handled yet") _dbuffer, _ddtype = col.get_buffers()['data'] - dcol = build_column(Buffer(_dbuffer.ptr, _dbuffer.bufsize), protocol_dtype_to_np_dtype(_ddtype)) - null_kind, null_value = col.describe_null - if null_kind != 0: - _vbuffer, _vdtype = col.get_buffers()['validity'] - valid_mask = cp.asarray(Buffer(_vbuffer.ptr, _vbuffer.bufsize), cp.bool8) - dcol[~valid_mask] = None - - return dcol, _dbuffer - # Buffer(_vbuffer.ptr, _vbuffer.bufsize)if _vbuffer != None else None) - # x = buffer_to_cupy_ndarray(_buffer, _dtype, allow_copy=allow_copy) - # x = buffer_to_cupy_ndarray(_buffer, _dtype, allow_copy=allow_copy) + dcol = build_column(Buffer(_dbuffer.ptr, _dbuffer.bufsize), protocol_dtype_to_np_dtype(_ddtype)) + return set_missing_values(col, dcol), _dbuffer - # return set_missing_values(col, x), _buffer - -def buffer_to_cupy_ndarray(_buffer, _dtype, allow_copy : bool = False) -> cp.ndarray: +def buffer_to_cupy_ndarray(_buffer, _dtype, allow_copy : bool = True) -> cp.ndarray: if _buffer.__dlpack_device__()[0] == 2: # dataframe is on GPU/CUDA x = _gpu_buffer_to_cupy(_buffer, _dtype) else: @@ -124,16 +113,15 @@ def buffer_to_cupy_ndarray(_buffer, _dtype, allow_copy : bool = False) -> cp.nda return x -def set_missing_values(col, col_array): - series = cudf.Series(col_array) +def set_missing_values(col, dcol): null_kind, null_value = col.describe_null if null_kind != 0: assert null_kind == 3, f"cudf supports only bit mask, null_kind should be 3 ." _mask_buffer, _mask_dtype = col.get_buffers()["validity"] - bitmask = buffer_to_cupy_ndarray(_mask_buffer, _mask_dtype) - series[bitmask==null_value] = None + bitmask = cp.asarray(Buffer(_mask_buffer.ptr, _mask_buffer.bufsize), cp.bool8) + dcol[~bitmask] = None - return series + return dcol def _gpu_buffer_to_cupy(_buffer, _dtype): _k = _DtypeKind @@ -146,7 +134,6 @@ def _gpu_buffer_to_cupy(_buffer, _dtype): return x def protocol_dtype_to_np_dtype(_dtype): - print(_dtype) kind = _dtype[0] bitwidth = _dtype[1] _k = _DtypeKind @@ -185,19 +172,15 @@ def convert_categorical_column(col : ColumnObject, allow_copy:bool=False) : if not is_dict: raise NotImplementedError('Non-dictionary categoricals not supported yet') - # If you want to cheat for testing (can't use `_col` in real-world code): - # categories = col._col.values.categories.values - # codes = col._col.values.codes - categories = cp.asarray(list(mapping.values())) + categories = as_column(mapping.values()) codes_buffer, codes_dtype = col.get_buffers()['data'] - codes = buffer_to_cupy_ndarray(codes_buffer, codes_dtype, - allow_copy=allow_copy) - values = categories[codes] + cdtype = protocol_dtype_to_np_dtype(codes_dtype) + codes = build_column(Buffer(codes_buffer.ptr, codes_buffer.bufsize), cdtype) + + col1 = build_categorical_column(categories=categories,codes=codes,mask=codes.base_mask, + size=codes.size,ordered=ordered) - # Seems like cudf can only construct with non-null values, so need to - # null out the nulls later - cat = cudf.CategoricalIndex(values, categories=categories, ordered=ordered) - return set_missing_values(col, cat), codes_buffer + return set_missing_values(col, col1), codes_buffer def __dataframe__(self, nan_as_null : bool = False, @@ -236,16 +219,7 @@ def __init__(self, x : Buffer, allow_copy : bool = True) -> None: """ Use cudf Buffer object. """ - # if not x.strides == (x.dtype.itemsize,): - # # The protocol does not support strided buffers, so a copy is - # # necessary. If that's not allowed, we need to raise an exception. - # if allow_copy: - # x = x.copy() - # else: - # raise RuntimeError("Exports cannot be zero-copy in the case " - # "of a non-contiguous buffer") - - # Store the numpy array in which the data resides as a private + # Store the cudf buffer where the data resides as a private # attribute, so we can use it to retrieve the public attributes self._x = x From ae5fb811b133364e59ccf20e8e73e6b73c3ccf49 Mon Sep 17 00:00:00 2001 From: iskode Date: Fri, 8 Oct 2021 03:43:44 +0000 Subject: [PATCH 17/60] string support of the protocol --- python/cudf/cudf/core/df_protocol.py | 156 +++++++++++---------- python/cudf/cudf/tests/test_df_protocol.py | 26 +++- 2 files changed, 101 insertions(+), 81 deletions(-) diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index 4f13f5c7738..1255b056108 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -64,11 +64,16 @@ def _from_dataframe(df : DataFrameObject) : _buffers = [] # hold on to buffers, keeps memory alive for name in df.column_names(): col = df.get_column_by_name(name) + if col.dtype[0] in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL): - # Simple numerical or bool dtype, turn into numpy array - columns[name], _buf = convert_column_to_cupy_ndarray(col, allow_copy=col._allow_copy) + columns[name], _buf = convert_to_cudf_column(col) + elif col.dtype[0] == _k.CATEGORICAL: - columns[name], _buf = convert_categorical_column(col, allow_copy=col._allow_copy) + columns[name], _buf = convert_to_cudf_categorical(col) + + elif col.dtype[0] == _k.STRING: + columns[name], _buf = convert_to_cudf_string(col) + else: raise NotImplementedError(f"Data type {col.dtype[0]} not handled yet") @@ -90,7 +95,7 @@ class _DtypeKind(enum.IntEnum): CATEGORICAL = 23 -def convert_column_to_cupy_ndarray(col:ColumnObject, allow_copy:bool = False) -> cp.ndarray: +def convert_to_cudf_column(col:ColumnObject) -> cp.ndarray: """ Convert an int, uint, float or bool column to a numpy array """ @@ -98,15 +103,22 @@ def convert_column_to_cupy_ndarray(col:ColumnObject, allow_copy:bool = False) -> raise NotImplementedError("column.offset > 0 not handled yet") _dbuffer, _ddtype = col.get_buffers()['data'] - dcol = build_column(Buffer(_dbuffer.ptr, _dbuffer.bufsize), protocol_dtype_to_np_dtype(_ddtype)) + check_data_is_on_gpu(_dbuffer) + dcol = build_column(Buffer(_dbuffer.ptr, _dbuffer.bufsize), + protocol_dtypes_to_cupy_dtype(_ddtype)) return set_missing_values(col, dcol), _dbuffer +def check_data_is_on_gpu(buffer): + + if buffer.__dlpack_device__()[0] != 2 and not buffer._allow_copy: + raise TypeError("This operation must copy data from CPU to GPU." + "Set `allow_copy=True` to allow it.") -def buffer_to_cupy_ndarray(_buffer, _dtype, allow_copy : bool = True) -> cp.ndarray: +def buffer_to_cupy_ndarray(_buffer, _dtype) -> cp.ndarray: if _buffer.__dlpack_device__()[0] == 2: # dataframe is on GPU/CUDA x = _gpu_buffer_to_cupy(_buffer, _dtype) else: - if not allow_copy: + if not _buffer._allow_copy: raise TypeError("This operation must copy data from CPU to GPU." "Set `allow_copy=True` to allow it.") x = _cpu_buffer_to_cupy(_buffer, _dtype) @@ -133,7 +145,7 @@ def _gpu_buffer_to_cupy(_buffer, _dtype): raise NotImplementedError(f"Data type {_dtype[0]} not handled yet") return x -def protocol_dtype_to_np_dtype(_dtype): +def protocol_dtypes_to_cupy_dtype(_dtype): kind = _dtype[0] bitwidth = _dtype[1] _k = _DtypeKind @@ -141,16 +153,16 @@ def protocol_dtype_to_np_dtype(_dtype): _k.STRING, _k.DATETIME): raise RuntimeError(f"Data type {_dtype[0]} not handled yet") - _ints = {8: np.int8, 16: np.int16, 32: np.int32, 64: np.int64} - _uints = {8: np.uint8, 16: np.uint16, 32: np.uint32, 64: np.uint64} - _floats = {32: np.float32, 64: np.float64} - _np_dtypes = {0: _ints, 1: _uints, 2: _floats, 20: {8: bool}} - return _np_dtypes[kind][bitwidth] + _ints = {8: cp.int8, 16: cp.int16, 32: cp.int32, 64: cp.int64} + _uints = {8: cp.uint8, 16: cp.uint16, 32: cp.uint32, 64: cp.uint64} + _floats = {32: cp.float32, 64: cp.float64} + _cp_dtypes = {0: _ints, 1: _uints, 2: _floats, 20: {8: bool}} + return _cp_dtypes[kind][bitwidth] def _cpu_buffer_to_cupy(_buffer, _dtype): # Handle the dtype - column_dtype = protocol_dtype_to_np_dtype(_dtype) + column_dtype = protocol_dtypes_to_cupy_dtype(_dtype) # No DLPack yet, so need to construct a new ndarray from the data pointer # and size in the buffer plus the dtype on the column ctypes_type = np.ctypeslib.as_ctypes_type(column_dtype) @@ -164,7 +176,7 @@ def _cpu_buffer_to_cupy(_buffer, _dtype): return cp.asarray(x, dtype=column_dtype) -def convert_categorical_column(col : ColumnObject, allow_copy:bool=False) : +def convert_to_cudf_categorical(col : ColumnObject) : """ Convert a categorical column to a Series instance """ @@ -174,7 +186,8 @@ def convert_categorical_column(col : ColumnObject, allow_copy:bool=False) : categories = as_column(mapping.values()) codes_buffer, codes_dtype = col.get_buffers()['data'] - cdtype = protocol_dtype_to_np_dtype(codes_dtype) + check_data_is_on_gpu(codes_buffer) + cdtype = protocol_dtypes_to_cupy_dtype(codes_dtype) codes = build_column(Buffer(codes_buffer.ptr, codes_buffer.bufsize), cdtype) col1 = build_categorical_column(categories=categories,codes=codes,mask=codes.base_mask, @@ -183,6 +196,32 @@ def convert_categorical_column(col : ColumnObject, allow_copy:bool=False) : return set_missing_values(col, col1), codes_buffer +def convert_to_cudf_string(col : ColumnObject) : + """ + Convert a string ColumnObject to cudf Column object. + """ + # Retrieve the data buffers + buffers = col.get_buffers() + + # Retrieve the data buffer containing the UTF-8 code units + dbuffer, bdtype = buffers["data"] + check_data_is_on_gpu(dbuffer) + encoded_string = build_column(Buffer(dbuffer.ptr, dbuffer.bufsize), + protocol_dtypes_to_cupy_dtype(bdtype) + ) + + # Retrieve the offsets buffer containing the index offsets demarcating the beginning and end of each string + obuffer, odtype = buffers["offsets"] + offsets = build_column(Buffer(obuffer.ptr, obuffer.bufsize), + protocol_dtypes_to_cupy_dtype(odtype) + ) + + col_str = build_column(None, dtype=cp.dtype('O'), children=(offsets, encoded_string)) + + return set_missing_values(col, col_str), buffers + + + def __dataframe__(self, nan_as_null : bool = False, allow_copy : bool = True) -> dict: """ @@ -222,6 +261,7 @@ def __init__(self, x : Buffer, allow_copy : bool = True) -> None: # Store the cudf buffer where the data resides as a private # attribute, so we can use it to retrieve the public attributes self._x = x + self._allow_copy = allow_copy @property def bufsize(self) -> int: @@ -284,7 +324,7 @@ class _CuDFColumn: def __init__(self, column, nan_as_null : bool = True, - allow_copy: bool = False) -> None: + allow_copy: bool = True) -> None: """ Note: doesn't deal with extension arrays yet, just assume a regular Series/ndarray for now. @@ -529,41 +569,21 @@ def _get_data_buffer(self) -> Tuple[_CuDFBuffer, Any]: # Any is for self.dtype """ _k = _DtypeKind invalid = self.describe_null[1] - if self.dtype[0] in (_k.INT, _k.UINT, _k.FLOAT): - buffer = _CuDFBuffer( - self._col.data, - # cp.asarray(self._col.fillna(invalid).to_gpu_array()), - allow_copy=self._allow_copy) - dtype = self.dtype - elif self.dtype[0] == _k.BOOL: - # convert bool to uint8 as dlpack does not support bool natively. - buffer = _CuDFBuffer( - self._col.data, - # cp.asarray(self._col.fillna(invalid).to_gpu_array(), dtype=cp.uint8), - allow_copy=self._allow_copy) + if self.dtype[0] in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL): + buffer = _CuDFBuffer(self._col.data, allow_copy=self._allow_copy) dtype = self.dtype + elif self.dtype[0] == _k.CATEGORICAL: codes = self._col.codes - buffer = _CuDFBuffer( - self._col.codes.data, - # cp.asarray(codes.fillna(invalid)), - allow_copy=self._allow_copy) + buffer = _CuDFBuffer(self._col.codes.data, allow_copy=self._allow_copy) dtype = self._dtype_from_cudfdtype(codes.dtype) - # elif self.dtype[0] == _k.STRING: - # # Marshal the strings from a NumPy object array into a byte array - # buf = self._col.to_numpy() - # b = bytearray() - - # # TODO: this for-loop is slow; can be implemented in Cython/C/C++ later - # for i in range(buf.size): - # if type(buf[i]) == str: - # b.extend(buf[i].encode(encoding="utf-8")) - # # Convert the byte array to a Pandas "buffer" using a NumPy array as the backing store - # buffer = _CuDFBuffer(np.frombuffer(b, dtype="uint8")) + elif self.dtype[0] == _k.STRING: + encoded_string = self._col.children[1] + buffer = _CuDFBuffer(encoded_string.data, allow_copy=self._allow_copy) + dtype = self._dtype_from_cudfdtype(encoded_string.dtype) + # dtype = (_k.STRING, 8, "u", "=") - # # Define the dtype for the returned buffer - # dtype = (_k.STRING, 8, "u", "=") # note: currently only support native endianness else: raise NotImplementedError(f"Data type {self._col.dtype} not handled yet") @@ -580,12 +600,12 @@ def _get_validity_buffer(self) -> Tuple[_CuDFBuffer, Any]: null, invalid = self.describe_null if null == 3: _k = _DtypeKind - # bitmask = cp.asarray(self._col._get_mask_as_column().to_gpu_array(), dtype=cp.uint8) - # buffer = _CuDFBuffer(bitmask) if self.dtype[0] == _k.CATEGORICAL: - buffer = _CuDFBuffer(self._col.codes._get_mask_as_column().data) + buffer = _CuDFBuffer(self._col.codes._get_mask_as_column().data, + allow_copy=self._allow_copy) else: - buffer = _CuDFBuffer(self._col._get_mask_as_column().data) + buffer = _CuDFBuffer(self._col._get_mask_as_column().data, + allow_copy=self._allow_copy) dtype = (_k.UINT, 8, "C", "=") return buffer, dtype @@ -606,33 +626,15 @@ def _get_offsets_buffer(self) -> Tuple[_CuDFBuffer, Any]: Raises RuntimeError if the data buffer does not have an associated offsets buffer. """ - # _k = _DtypeKind - # if self.dtype[0] == _k.STRING: - # # For each string, we need to manually determine the next offset - # values = self._col.to_numpy() - # ptr = 0 - # offsets = [ptr] - # for v in values: - # # For missing values (in this case, `np.nan` values), we don't increment the pointer) - # if type(v) == str: - # b = v.encode(encoding="utf-8") - # ptr += len(b) - - # offsets.append(ptr) - - # # Convert the list of offsets to a NumPy array of signed 64-bit integers (note: Arrow allows the offsets array to be either `int32` or `int64`; here, we default to the latter) - # buf = cp.asarray(offsets, dtype="int64") - - # # Convert the offsets to a Pandas "buffer" using the NumPy array as the backing store - # buffer = _CuDFBuffer(buf) - - # # Assemble the buffer dtype info - # dtype = (_k.INT, 64, 'l', "=") # note: currently only support native endianness - # else: - # raise RuntimeError("This column has a fixed-length dtype so does not have an offsets buffer") - - # return buffer, dtype - pass + _k = _DtypeKind + if self.dtype[0] == _k.STRING: + offsets = self._col.children[0] + buffer = _CuDFBuffer(offsets.data, allow_copy=self._allow_copy) + dtype = self._dtype_from_cudfdtype(offsets.dtype) + else: + raise RuntimeError("This column has a fixed-length dtype so does not have an offsets buffer") + + return buffer, dtype class _CuDFDataFrame: """ diff --git a/python/cudf/cudf/tests/test_df_protocol.py b/python/cudf/cudf/tests/test_df_protocol.py index f89ddeeb0e3..7936aa46ac5 100644 --- a/python/cudf/cudf/tests/test_df_protocol.py +++ b/python/cudf/cudf/tests/test_df_protocol.py @@ -115,6 +115,13 @@ def test_bool_dtype(): data_bool = dict(a=[True, True, False], b=[False, True, False]) _test_datatype(data_bool) + +def test_string_dtype(): + data_string = dict(a=["a", "b", "cdef", "", "g"]) + _test_datatype(data_string) + + + def test_mixed_dtype(): data_mixed = dict(int=[1, 2, 3], float=[1.5, 2.5, 3.5], bool=[True, False, True], categorical=[5, 1, 5]) @@ -148,9 +155,20 @@ def test_NA_categorical_dtype(): _test_from_dataframe_equals(df.__dataframe__(allow_copy=False)) _test_from_dataframe_equals(df.__dataframe__(allow_copy=True)) - - - def test_NA_bool_dtype(): data_bool = dict(a=[None, True, False], b=[False, None, None]) - _test_datatype(data_bool) \ No newline at end of file + _test_datatype(data_bool) + +def test_NA_string_dtype(): + df = cudf.DataFrame({"A": ["a", "b", "cdef", "", "g"]}) + df["B"] = df["A"].astype("object") + df.at[1, "B"] = cudf.NA # Set one item to null + + # Test for correctness and null handling: + col = df.__dataframe__().get_column_by_name("B") + assert col.dtype[0] == _DtypeKind.STRING + assert col.null_count == 1 + assert col.describe_null == (3, 0) + assert col.num_chunks() == 1 + _test_from_dataframe_equals(df.__dataframe__(allow_copy=False)) + _test_from_dataframe_equals(df.__dataframe__(allow_copy=True)) From ed7130b8c044373d148d75cd9b3291a6e050ae29 Mon Sep 17 00:00:00 2001 From: iskode Date: Thu, 14 Oct 2021 17:48:16 +0000 Subject: [PATCH 18/60] rename class attribute 'x' into 'buf' --- python/cudf/cudf/core/df_protocol.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index 1255b056108..376d68e3c95 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -254,13 +254,13 @@ class _CuDFBuffer: Data in the buffer is guaranteed to be contiguous in memory. """ - def __init__(self, x : Buffer, allow_copy : bool = True) -> None: + def __init__(self, buf : Buffer, allow_copy : bool = True) -> None: """ Use cudf Buffer object. """ # Store the cudf buffer where the data resides as a private # attribute, so we can use it to retrieve the public attributes - self._x = x + self._buf = buf self._allow_copy = allow_copy @property @@ -268,15 +268,14 @@ def bufsize(self) -> int: """ Buffer size in bytes. """ - return self._x.nbytes - # return self._x.size * self._x.dtype.itemsize + return self._buf.nbytes @property def ptr(self) -> int: """ Pointer to start of the buffer as an integer. """ - return self._x.ptr + return self._buf.ptr # return self._x.__cuda_array_interface__['data'][0] def __dlpack__(self): @@ -285,9 +284,9 @@ def __dlpack__(self): """ try: # res = self._x.toDlpack() - res = cp.asarray(self._x).toDlpack() + res = cp.asarray(self._buf).toDlpack() except ValueError: - raise TypeError(f'dtype {self._x.dtype} unsupported by `dlpack`') + raise TypeError(f'dtype {self._buf.dtype} unsupported by `dlpack`') return res @@ -298,7 +297,7 @@ def __dlpack_device__(self) -> Tuple[enum.IntEnum, int]: class Device(enum.IntEnum): CUDA = 2 - return (Device.CUDA, cp.asarray(self._x).device.id) + return (Device.CUDA, cp.asarray(self._buf).device.id) def __repr__(self) -> str: return 'CuDFBuffer(' + str({'bufsize': self.bufsize, From 9a6f957f2a2143af144fa04e0347593afd9bc6a1 Mon Sep 17 00:00:00 2001 From: iskode Date: Fri, 15 Oct 2021 07:32:48 +0000 Subject: [PATCH 19/60] add thorough tests for on '_CuDFcolumn' and '_CuDFBuffer' classes --- python/cudf/cudf/tests/test_df_protocol.py | 44 ++++++++++++++++++---- 1 file changed, 36 insertions(+), 8 deletions(-) diff --git a/python/cudf/cudf/tests/test_df_protocol.py b/python/cudf/cudf/tests/test_df_protocol.py index 7936aa46ac5..413f02b8afd 100644 --- a/python/cudf/cudf/tests/test_df_protocol.py +++ b/python/cudf/cudf/tests/test_df_protocol.py @@ -1,17 +1,21 @@ import datetime -import cupy +import cupy as cp import numpy as np import pytest from cudf.core.df_protocol import ( _from_dataframe, _DtypeKind, - __dataframe__, + protocol_dtypes_to_cupy_dtype, + + _CuDFDataFrame, _CuDFColumn, _CuDFBuffer ) import cudf +from cudf.core.column import build_column +from cudf.core.buffer import Buffer from cudf.testing import _utils as utils from cudf.testing._utils import ( ALL_TYPES, @@ -27,11 +31,20 @@ DataFrameObject = Any -def assert_buffer_equal(buffer_dtype: Tuple[_CuDFBuffer, Any], cudfcol:cudf.Series): +def assert_buffer_equal(buffer_dtype: Tuple[_CuDFBuffer, Any], cudfcol): buf, dtype = buffer_dtype - assert buf.__dlpack_device__() == (2, 0) - -def assert_column_equal(col: _CuDFColumn, cudfcol:cudf.Series): + device_id = cp.asarray(cudfcol.data).device.id + assert buf.__dlpack_device__() == (2, device_id) + col_from_buf = build_column(Buffer(buf.ptr, buf.bufsize), + protocol_dtypes_to_cupy_dtype(dtype) + ) + # check that non null values are the equals as null are represented + # by sentinel values in the buffer. + non_null_idxs = cudfcol!=None + assert_eq(col_from_buf[non_null_idxs], cudfcol[non_null_idxs]) + + +def assert_column_equal(col: _CuDFColumn, cudfcol): assert col.size == cudfcol.size assert col.offset == 0 assert col.null_count == cudfcol.isna().sum() @@ -39,7 +52,22 @@ def assert_column_equal(col: _CuDFColumn, cudfcol:cudf.Series): if col.null_count == 0 : pytest.raises(RuntimeError, col._get_validity_buffer) assert col.get_buffers()['validity'] == None - assert_buffer_equal(col._get_data_buffer(), cudfcol) + else: + assert_buffer_equal(col.get_buffers()['validity'], + cudfcol._get_mask_as_column().astype(cp.uint8)) + + if col.dtype[0] == _DtypeKind.CATEGORICAL: + assert_buffer_equal(col.get_buffers()['data'], cudfcol.codes) + assert col.get_buffers()['offsets'] == None + + elif col.dtype[0] == _DtypeKind.STRING: + assert_buffer_equal(col.get_buffers()['data'], cudfcol.children[1]) + assert_buffer_equal(col.get_buffers()['offsets'], cudfcol.children[0]) + + else: + assert_buffer_equal(col.get_buffers()['data'], cudfcol) + assert col.get_buffers()['offsets'] == None + null_kind, null_value = col.describe_null if col.null_count == 0: assert null_kind == 0 @@ -55,7 +83,7 @@ def assert_dataframe_equal(dfo: DataFrameObject, df:cudf.DataFrame): assert dfo.num_chunks() == 1 assert dfo.column_names() == list(df.columns) for col in df.columns: - assert_column_equal(dfo.get_column_by_name(col), df[col]) + assert_column_equal(dfo.get_column_by_name(col), df[col]._column) def _test_from_dataframe_equals(dfobj): From f9ca94d86e7b93e1457abc785faadc1b560c1b9e Mon Sep 17 00:00:00 2001 From: iskode Date: Fri, 15 Oct 2021 09:01:46 +0000 Subject: [PATCH 20/60] write 'dlpack' support tests --- python/cudf/cudf/tests/test_df_protocol.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/python/cudf/cudf/tests/test_df_protocol.py b/python/cudf/cudf/tests/test_df_protocol.py index 413f02b8afd..d5f12d76085 100644 --- a/python/cudf/cudf/tests/test_df_protocol.py +++ b/python/cudf/cudf/tests/test_df_protocol.py @@ -42,6 +42,12 @@ def assert_buffer_equal(buffer_dtype: Tuple[_CuDFBuffer, Any], cudfcol): # by sentinel values in the buffer. non_null_idxs = cudfcol!=None assert_eq(col_from_buf[non_null_idxs], cudfcol[non_null_idxs]) + array_from_dlpack = cp.fromDlpack(buf.__dlpack__()) + col_array = cp.asarray(cudfcol.data_array_view) + # non_null_idxs = (col_array!=None) + assert_eq(array_from_dlpack.all(), col_array.all()) + print(f"dlpack OK: \n{array_from_dlpack}\n{col_array}") + def assert_column_equal(col: _CuDFColumn, cudfcol): From 1709babdd0f6d5d79110d3c3668f3cb772fff26b Mon Sep 17 00:00:00 2001 From: iskode Date: Fri, 15 Oct 2021 09:23:50 +0000 Subject: [PATCH 21/60] dlpack support ok --- python/cudf/cudf/core/df_protocol.py | 23 +++++++++++++--------- python/cudf/cudf/tests/test_df_protocol.py | 14 ++++++++----- 2 files changed, 23 insertions(+), 14 deletions(-) diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index 376d68e3c95..de25bea4563 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -28,6 +28,7 @@ from cudf.core.buffer import Buffer import numpy as np import cupy as cp +from numba import cuda # A typing protocol could be added later to let Mypy validate code using @@ -254,13 +255,14 @@ class _CuDFBuffer: Data in the buffer is guaranteed to be contiguous in memory. """ - def __init__(self, buf : Buffer, allow_copy : bool = True) -> None: + def __init__(self, buf : Buffer, cudf_dtype, allow_copy : bool = True) -> None: """ Use cudf Buffer object. """ # Store the cudf buffer where the data resides as a private # attribute, so we can use it to retrieve the public attributes self._buf = buf + self._cudf_dtype = cudf_dtype self._allow_copy = allow_copy @property @@ -284,9 +286,10 @@ def __dlpack__(self): """ try: # res = self._x.toDlpack() - res = cp.asarray(self._buf).toDlpack() + cudarray = cuda.as_cuda_array(self._buf).view(self._cudf_dtype) + res = cp.asarray(cudarray).toDlpack() except ValueError: - raise TypeError(f'dtype {self._buf.dtype} unsupported by `dlpack`') + raise TypeError(f'dtype {self._cudf_dtype} unsupported by `dlpack`') return res @@ -569,17 +572,19 @@ def _get_data_buffer(self) -> Tuple[_CuDFBuffer, Any]: # Any is for self.dtype _k = _DtypeKind invalid = self.describe_null[1] if self.dtype[0] in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL): - buffer = _CuDFBuffer(self._col.data, allow_copy=self._allow_copy) + buffer = _CuDFBuffer(self._col.data, self._col.dtype, + allow_copy=self._allow_copy) dtype = self.dtype elif self.dtype[0] == _k.CATEGORICAL: codes = self._col.codes - buffer = _CuDFBuffer(self._col.codes.data, allow_copy=self._allow_copy) + buffer = _CuDFBuffer(self._col.codes.data, self._col.codes.dtype, + allow_copy=self._allow_copy) dtype = self._dtype_from_cudfdtype(codes.dtype) elif self.dtype[0] == _k.STRING: encoded_string = self._col.children[1] - buffer = _CuDFBuffer(encoded_string.data, allow_copy=self._allow_copy) + buffer = _CuDFBuffer(encoded_string.data, encoded_string.dtype, allow_copy=self._allow_copy) dtype = self._dtype_from_cudfdtype(encoded_string.dtype) # dtype = (_k.STRING, 8, "u", "=") @@ -600,10 +605,10 @@ def _get_validity_buffer(self) -> Tuple[_CuDFBuffer, Any]: if null == 3: _k = _DtypeKind if self.dtype[0] == _k.CATEGORICAL: - buffer = _CuDFBuffer(self._col.codes._get_mask_as_column().data, + buffer = _CuDFBuffer(self._col.codes._get_mask_as_column().data, cp.uint8, allow_copy=self._allow_copy) else: - buffer = _CuDFBuffer(self._col._get_mask_as_column().data, + buffer = _CuDFBuffer(self._col._get_mask_as_column().data, cp.uint8, allow_copy=self._allow_copy) dtype = (_k.UINT, 8, "C", "=") return buffer, dtype @@ -628,7 +633,7 @@ def _get_offsets_buffer(self) -> Tuple[_CuDFBuffer, Any]: _k = _DtypeKind if self.dtype[0] == _k.STRING: offsets = self._col.children[0] - buffer = _CuDFBuffer(offsets.data, allow_copy=self._allow_copy) + buffer = _CuDFBuffer(offsets.data, offsets.dtype, allow_copy=self._allow_copy) dtype = self._dtype_from_cudfdtype(offsets.dtype) else: raise RuntimeError("This column has a fixed-length dtype so does not have an offsets buffer") diff --git a/python/cudf/cudf/tests/test_df_protocol.py b/python/cudf/cudf/tests/test_df_protocol.py index d5f12d76085..523fb0ce7a1 100644 --- a/python/cudf/cudf/tests/test_df_protocol.py +++ b/python/cudf/cudf/tests/test_df_protocol.py @@ -42,11 +42,15 @@ def assert_buffer_equal(buffer_dtype: Tuple[_CuDFBuffer, Any], cudfcol): # by sentinel values in the buffer. non_null_idxs = cudfcol!=None assert_eq(col_from_buf[non_null_idxs], cudfcol[non_null_idxs]) - array_from_dlpack = cp.fromDlpack(buf.__dlpack__()) - col_array = cp.asarray(cudfcol.data_array_view) - # non_null_idxs = (col_array!=None) - assert_eq(array_from_dlpack.all(), col_array.all()) - print(f"dlpack OK: \n{array_from_dlpack}\n{col_array}") + + if dtype[0] != _DtypeKind.BOOL: + array_from_dlpack = cp.fromDlpack(buf.__dlpack__()) + col_array = cp.asarray(cudfcol.data_array_view) + assert_eq(array_from_dlpack.all(), col_array.all()) + print(f"dlpack OK: \n{array_from_dlpack}\n{col_array}") + else: + pytest.raises(TypeError, buf.__dlpack__) + From 25c4474629d4ae4e147751ed1217d018c68f76f4 Mon Sep 17 00:00:00 2001 From: iskode Date: Mon, 18 Oct 2021 08:36:17 +0000 Subject: [PATCH 22/60] use '_from_data' to create dataframe from columns --- python/cudf/cudf/core/df_protocol.py | 4 +--- python/cudf/cudf/tests/test_df_protocol.py | 1 - 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index de25bea4563..905a9f8695c 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -80,7 +80,7 @@ def _from_dataframe(df : DataFrameObject) : _buffers.append(_buf) - df_new = cudf.DataFrame(columns) + df_new = cudf.DataFrame._from_data(columns) df_new._buffers = _buffers return df_new @@ -278,14 +278,12 @@ def ptr(self) -> int: Pointer to start of the buffer as an integer. """ return self._buf.ptr - # return self._x.__cuda_array_interface__['data'][0] def __dlpack__(self): """ DLPack not implemented in NumPy yet, so leave it out here. """ try: - # res = self._x.toDlpack() cudarray = cuda.as_cuda_array(self._buf).view(self._cudf_dtype) res = cp.asarray(cudarray).toDlpack() except ValueError: diff --git a/python/cudf/cudf/tests/test_df_protocol.py b/python/cudf/cudf/tests/test_df_protocol.py index 523fb0ce7a1..2fe1652750d 100644 --- a/python/cudf/cudf/tests/test_df_protocol.py +++ b/python/cudf/cudf/tests/test_df_protocol.py @@ -47,7 +47,6 @@ def assert_buffer_equal(buffer_dtype: Tuple[_CuDFBuffer, Any], cudfcol): array_from_dlpack = cp.fromDlpack(buf.__dlpack__()) col_array = cp.asarray(cudfcol.data_array_view) assert_eq(array_from_dlpack.all(), col_array.all()) - print(f"dlpack OK: \n{array_from_dlpack}\n{col_array}") else: pytest.raises(TypeError, buf.__dlpack__) From 5f441c2767938c214b02f7d01f5ac0a794c3274c Mon Sep 17 00:00:00 2001 From: iskode Date: Wed, 20 Oct 2021 18:04:44 +0000 Subject: [PATCH 23/60] harmonize method names like 'convert_to_cudf_string' --- python/cudf/cudf/core/df_protocol.py | 38 +++++++++++++--------------- 1 file changed, 18 insertions(+), 20 deletions(-) diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index 905a9f8695c..fb5bf8434fb 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -4,7 +4,7 @@ Public API ---------- -from_dataframe : construct a pandas.DataFrame from an input data frame which +from_dataframe : construct a cudf.DataFrame from an input data frame which implements the exchange protocol Notes @@ -58,8 +58,7 @@ def _from_dataframe(df : DataFrameObject) : if df.num_chunks() > 1: raise NotImplementedError - # We need a dict of columns here, with each column being a numpy array (at - # least for now, deal with non-numpy dtypes later). + # We need a dict of columns here, with each column being a cudf column column. columns = dict() _k = _DtypeKind _buffers = [] # hold on to buffers, keeps memory alive @@ -67,7 +66,7 @@ def _from_dataframe(df : DataFrameObject) : col = df.get_column_by_name(name) if col.dtype[0] in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL): - columns[name], _buf = convert_to_cudf_column(col) + columns[name], _buf = _protocol_column_to_cudf_column_numeric(col) elif col.dtype[0] == _k.CATEGORICAL: columns[name], _buf = convert_to_cudf_categorical(col) @@ -95,22 +94,21 @@ class _DtypeKind(enum.IntEnum): DATETIME = 22 CATEGORICAL = 23 - -def convert_to_cudf_column(col:ColumnObject) -> cp.ndarray: +def _protocol_column_to_cudf_column_numeric(col:ColumnObject): """ - Convert an int, uint, float or bool column to a numpy array + Convert an int, uint, float or bool protocol column to the corresponding cudf column """ if col.offset != 0: raise NotImplementedError("column.offset > 0 not handled yet") _dbuffer, _ddtype = col.get_buffers()['data'] - check_data_is_on_gpu(_dbuffer) + _check_data_is_on_gpu(_dbuffer) dcol = build_column(Buffer(_dbuffer.ptr, _dbuffer.bufsize), protocol_dtypes_to_cupy_dtype(_ddtype)) - return set_missing_values(col, dcol), _dbuffer + return _set_missing_values(col, dcol), _dbuffer + -def check_data_is_on_gpu(buffer): - +def _check_data_is_on_gpu(buffer): if buffer.__dlpack_device__()[0] != 2 and not buffer._allow_copy: raise TypeError("This operation must copy data from CPU to GPU." "Set `allow_copy=True` to allow it.") @@ -126,15 +124,15 @@ def buffer_to_cupy_ndarray(_buffer, _dtype) -> cp.ndarray: return x -def set_missing_values(col, dcol): - null_kind, null_value = col.describe_null +def _set_missing_values(protocol_col, cudf_col): + null_kind, null_value = protocol_col.describe_null if null_kind != 0: assert null_kind == 3, f"cudf supports only bit mask, null_kind should be 3 ." - _mask_buffer, _mask_dtype = col.get_buffers()["validity"] + _mask_buffer, _mask_dtype = protocol_col.get_buffers()["validity"] bitmask = cp.asarray(Buffer(_mask_buffer.ptr, _mask_buffer.bufsize), cp.bool8) - dcol[~bitmask] = None + cudf_col[~bitmask] = None - return dcol + return cudf_col def _gpu_buffer_to_cupy(_buffer, _dtype): _k = _DtypeKind @@ -187,14 +185,14 @@ def convert_to_cudf_categorical(col : ColumnObject) : categories = as_column(mapping.values()) codes_buffer, codes_dtype = col.get_buffers()['data'] - check_data_is_on_gpu(codes_buffer) + _check_data_is_on_gpu(codes_buffer) cdtype = protocol_dtypes_to_cupy_dtype(codes_dtype) codes = build_column(Buffer(codes_buffer.ptr, codes_buffer.bufsize), cdtype) col1 = build_categorical_column(categories=categories,codes=codes,mask=codes.base_mask, size=codes.size,ordered=ordered) - return set_missing_values(col, col1), codes_buffer + return _set_missing_values(col, col1), codes_buffer def convert_to_cudf_string(col : ColumnObject) : @@ -206,7 +204,7 @@ def convert_to_cudf_string(col : ColumnObject) : # Retrieve the data buffer containing the UTF-8 code units dbuffer, bdtype = buffers["data"] - check_data_is_on_gpu(dbuffer) + _check_data_is_on_gpu(dbuffer) encoded_string = build_column(Buffer(dbuffer.ptr, dbuffer.bufsize), protocol_dtypes_to_cupy_dtype(bdtype) ) @@ -219,7 +217,7 @@ def convert_to_cudf_string(col : ColumnObject) : col_str = build_column(None, dtype=cp.dtype('O'), children=(offsets, encoded_string)) - return set_missing_values(col, col_str), buffers + return _set_missing_values(col, col_str), buffers From 78741a936dc25bff42e1dd21ff39501c4a6ed7eb Mon Sep 17 00:00:00 2001 From: iskode Date: Fri, 22 Oct 2021 08:43:58 +0000 Subject: [PATCH 24/60] Do the same for 'convert_to_cudf_categorical' --- python/cudf/cudf/core/df_protocol.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index fb5bf8434fb..89e67d60e76 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -69,10 +69,10 @@ def _from_dataframe(df : DataFrameObject) : columns[name], _buf = _protocol_column_to_cudf_column_numeric(col) elif col.dtype[0] == _k.CATEGORICAL: - columns[name], _buf = convert_to_cudf_categorical(col) + columns[name], _buf = _protocol_column_to_cudf_column_categorical(col) elif col.dtype[0] == _k.STRING: - columns[name], _buf = convert_to_cudf_string(col) + columns[name], _buf = _protocol_column_to_cudf_column_string(col) else: raise NotImplementedError(f"Data type {col.dtype[0]} not handled yet") @@ -175,7 +175,7 @@ def _cpu_buffer_to_cupy(_buffer, _dtype): return cp.asarray(x, dtype=column_dtype) -def convert_to_cudf_categorical(col : ColumnObject) : +def _protocol_column_to_cudf_column_categorical(col : ColumnObject) : """ Convert a categorical column to a Series instance """ @@ -195,7 +195,7 @@ def convert_to_cudf_categorical(col : ColumnObject) : return _set_missing_values(col, col1), codes_buffer -def convert_to_cudf_string(col : ColumnObject) : +def _protocol_column_to_cudf_column_string(col : ColumnObject) : """ Convert a string ColumnObject to cudf Column object. """ From 3cc229b322b99402b59c6d10cd3166a35fe0bc3f Mon Sep 17 00:00:00 2001 From: iskode Date: Fri, 22 Oct 2021 08:51:53 +0000 Subject: [PATCH 25/60] remove unused methods --- python/cudf/cudf/core/df_protocol.py | 41 ++-------------------------- 1 file changed, 3 insertions(+), 38 deletions(-) diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index 89e67d60e76..49808fee17f 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -94,6 +94,9 @@ class _DtypeKind(enum.IntEnum): DATETIME = 22 CATEGORICAL = 23 + + + def _protocol_column_to_cudf_column_numeric(col:ColumnObject): """ Convert an int, uint, float or bool protocol column to the corresponding cudf column @@ -113,17 +116,6 @@ def _check_data_is_on_gpu(buffer): raise TypeError("This operation must copy data from CPU to GPU." "Set `allow_copy=True` to allow it.") -def buffer_to_cupy_ndarray(_buffer, _dtype) -> cp.ndarray: - if _buffer.__dlpack_device__()[0] == 2: # dataframe is on GPU/CUDA - x = _gpu_buffer_to_cupy(_buffer, _dtype) - else: - if not _buffer._allow_copy: - raise TypeError("This operation must copy data from CPU to GPU." - "Set `allow_copy=True` to allow it.") - x = _cpu_buffer_to_cupy(_buffer, _dtype) - - return x - def _set_missing_values(protocol_col, cudf_col): null_kind, null_value = protocol_col.describe_null if null_kind != 0: @@ -134,16 +126,6 @@ def _set_missing_values(protocol_col, cudf_col): return cudf_col -def _gpu_buffer_to_cupy(_buffer, _dtype): - _k = _DtypeKind - if _dtype[0] in (_k.INT, _k.UINT, _k.FLOAT, _k.CATEGORICAL): - x = cp.fromDlpack(_buffer.__dlpack__()) - elif _dtype[0] == _k.BOOL: - x = cp.fromDlpack(_buffer.__dlpack__()).astype(cp.bool_) - else: - raise NotImplementedError(f"Data type {_dtype[0]} not handled yet") - return x - def protocol_dtypes_to_cupy_dtype(_dtype): kind = _dtype[0] bitwidth = _dtype[1] @@ -158,23 +140,6 @@ def protocol_dtypes_to_cupy_dtype(_dtype): _cp_dtypes = {0: _ints, 1: _uints, 2: _floats, 20: {8: bool}} return _cp_dtypes[kind][bitwidth] -def _cpu_buffer_to_cupy(_buffer, _dtype): - # Handle the dtype - - column_dtype = protocol_dtypes_to_cupy_dtype(_dtype) - # No DLPack yet, so need to construct a new ndarray from the data pointer - # and size in the buffer plus the dtype on the column - ctypes_type = np.ctypeslib.as_ctypes_type(column_dtype) - data_pointer = ctypes.cast(_buffer.ptr, ctypes.POINTER(ctypes_type)) - - # NOTE: `x` does not own its memory, so the caller of this function must - # either make a copy or hold on to a reference of the column or - # buffer! (not done yet, this is pretty awful ...) - x = np.ctypeslib.as_array(data_pointer, - shape=(_buffer.bufsize // (bitwidth//8),)) - return cp.asarray(x, dtype=column_dtype) - - def _protocol_column_to_cudf_column_categorical(col : ColumnObject) : """ Convert a categorical column to a Series instance From 6fa456680dbb5c531eacc73a962f407fb35dd780 Mon Sep 17 00:00:00 2001 From: iskode Date: Fri, 22 Oct 2021 09:23:57 +0000 Subject: [PATCH 26/60] reorganize code so that class mentions occur after their definitions --- python/cudf/cudf/core/df_protocol.py | 379 +++++++++++++-------------- 1 file changed, 189 insertions(+), 190 deletions(-) diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index 49808fee17f..9a4f052b598 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -1,23 +1,3 @@ -""" -Implementation of the dataframe exchange protocol. - -Public API ----------- - -from_dataframe : construct a cudf.DataFrame from an input data frame which - implements the exchange protocol - -Notes ------ - -- Interpreting a raw pointer (as in ``Buffer.ptr``) is annoying and unsafe to - do in pure Python. It's more general but definitely less friendly than having - ``to_arrow`` and ``to_numpy`` methods. So for the buffers which lack - ``__dlpack__`` (e.g., because the column dtype isn't supported by DLPack), - this is worth looking at again. - -""" - import enum import collections import ctypes @@ -31,59 +11,9 @@ from numba import cuda -# A typing protocol could be added later to let Mypy validate code using -# `from_dataframe` better. -DataFrameObject = Any -ColumnObject = Any - - -def from_dataframe(df : DataFrameObject, allow_copy: bool = False) : - """ - Construct a cudf DataFrame from ``df`` if it supports ``__dataframe__`` - """ - if isinstance(df, cudf.DataFrame): - return df - - if not hasattr(df, '__dataframe__'): - raise ValueError("`df` does not support __dataframe__") - - return _from_dataframe(df.__dataframe__(allow_copy=allow_copy)) - - -def _from_dataframe(df : DataFrameObject) : - """ - Create a cudf DataFrame object from DataFrameObject. - """ - # Check number of chunks, if there's more than one we need to iterate - if df.num_chunks() > 1: - raise NotImplementedError - - # We need a dict of columns here, with each column being a cudf column column. - columns = dict() - _k = _DtypeKind - _buffers = [] # hold on to buffers, keeps memory alive - for name in df.column_names(): - col = df.get_column_by_name(name) - - if col.dtype[0] in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL): - columns[name], _buf = _protocol_column_to_cudf_column_numeric(col) - - elif col.dtype[0] == _k.CATEGORICAL: - columns[name], _buf = _protocol_column_to_cudf_column_categorical(col) - - elif col.dtype[0] == _k.STRING: - columns[name], _buf = _protocol_column_to_cudf_column_string(col) - - else: - raise NotImplementedError(f"Data type {col.dtype[0]} not handled yet") - - _buffers.append(_buf) - - df_new = cudf.DataFrame._from_data(columns) - df_new._buffers = _buffers - return df_new - +# Implementation of interchange protocol classes +# ---------------------------------------------- class _DtypeKind(enum.IntEnum): INT = 0 @@ -93,125 +23,7 @@ class _DtypeKind(enum.IntEnum): STRING = 21 # UTF-8 DATETIME = 22 CATEGORICAL = 23 - - - - -def _protocol_column_to_cudf_column_numeric(col:ColumnObject): - """ - Convert an int, uint, float or bool protocol column to the corresponding cudf column - """ - if col.offset != 0: - raise NotImplementedError("column.offset > 0 not handled yet") - - _dbuffer, _ddtype = col.get_buffers()['data'] - _check_data_is_on_gpu(_dbuffer) - dcol = build_column(Buffer(_dbuffer.ptr, _dbuffer.bufsize), - protocol_dtypes_to_cupy_dtype(_ddtype)) - return _set_missing_values(col, dcol), _dbuffer - - -def _check_data_is_on_gpu(buffer): - if buffer.__dlpack_device__()[0] != 2 and not buffer._allow_copy: - raise TypeError("This operation must copy data from CPU to GPU." - "Set `allow_copy=True` to allow it.") - -def _set_missing_values(protocol_col, cudf_col): - null_kind, null_value = protocol_col.describe_null - if null_kind != 0: - assert null_kind == 3, f"cudf supports only bit mask, null_kind should be 3 ." - _mask_buffer, _mask_dtype = protocol_col.get_buffers()["validity"] - bitmask = cp.asarray(Buffer(_mask_buffer.ptr, _mask_buffer.bufsize), cp.bool8) - cudf_col[~bitmask] = None - - return cudf_col - -def protocol_dtypes_to_cupy_dtype(_dtype): - kind = _dtype[0] - bitwidth = _dtype[1] - _k = _DtypeKind - if _dtype[0] not in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL,_k.CATEGORICAL, - _k.STRING, _k.DATETIME): - raise RuntimeError(f"Data type {_dtype[0]} not handled yet") - - _ints = {8: cp.int8, 16: cp.int16, 32: cp.int32, 64: cp.int64} - _uints = {8: cp.uint8, 16: cp.uint16, 32: cp.uint32, 64: cp.uint64} - _floats = {32: cp.float32, 64: cp.float64} - _cp_dtypes = {0: _ints, 1: _uints, 2: _floats, 20: {8: bool}} - return _cp_dtypes[kind][bitwidth] - -def _protocol_column_to_cudf_column_categorical(col : ColumnObject) : - """ - Convert a categorical column to a Series instance - """ - ordered, is_dict, mapping = col.describe_categorical - if not is_dict: - raise NotImplementedError('Non-dictionary categoricals not supported yet') - - categories = as_column(mapping.values()) - codes_buffer, codes_dtype = col.get_buffers()['data'] - _check_data_is_on_gpu(codes_buffer) - cdtype = protocol_dtypes_to_cupy_dtype(codes_dtype) - codes = build_column(Buffer(codes_buffer.ptr, codes_buffer.bufsize), cdtype) - col1 = build_categorical_column(categories=categories,codes=codes,mask=codes.base_mask, - size=codes.size,ordered=ordered) - - return _set_missing_values(col, col1), codes_buffer - - -def _protocol_column_to_cudf_column_string(col : ColumnObject) : - """ - Convert a string ColumnObject to cudf Column object. - """ - # Retrieve the data buffers - buffers = col.get_buffers() - - # Retrieve the data buffer containing the UTF-8 code units - dbuffer, bdtype = buffers["data"] - _check_data_is_on_gpu(dbuffer) - encoded_string = build_column(Buffer(dbuffer.ptr, dbuffer.bufsize), - protocol_dtypes_to_cupy_dtype(bdtype) - ) - - # Retrieve the offsets buffer containing the index offsets demarcating the beginning and end of each string - obuffer, odtype = buffers["offsets"] - offsets = build_column(Buffer(obuffer.ptr, obuffer.bufsize), - protocol_dtypes_to_cupy_dtype(odtype) - ) - - col_str = build_column(None, dtype=cp.dtype('O'), children=(offsets, encoded_string)) - - return _set_missing_values(col, col_str), buffers - - - -def __dataframe__(self, nan_as_null : bool = False, - allow_copy : bool = True) -> dict: - """ - The public method to attach to cudf.DataFrame. - - We'll attach it via monkey-patching here for demo purposes. If Pandas adopts - the protocol, this will be a regular method on pandas.DataFrame. - - ``nan_as_null`` is a keyword intended for the consumer to tell the - producer to overwrite null values in the data with ``NaN`` (or ``NaT``). - This currently has no effect; once support for nullable extension - dtypes is added, this value should be propagated to columns. - - ``allow_copy`` is a keyword that defines whether or not the library is - allowed to make a copy of the data. For example, copying data would be - necessary if a library supports strided buffers, given that this protocol - specifies contiguous buffers. - Currently, if the flag is set to ``False`` and a copy is needed, a - ``RuntimeError`` will be raised. - """ - return _CuDFDataFrame( - self, nan_as_null=nan_as_null, allow_copy=allow_copy) - - -# Implementation of interchange protocol -# -------------------------------------- class _CuDFBuffer: """ @@ -672,3 +484,190 @@ def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable['_CuDFDataFram Return an iterator yielding the chunks. """ return (self,) + + +""" +Implementation of the dataframe exchange protocol. + +Public API +---------- + +from_dataframe : construct a cudf.DataFrame from an input data frame which + implements the exchange protocol + +Notes +----- + +- Interpreting a raw pointer (as in ``Buffer.ptr``) is annoying and unsafe to + do in pure Python. It's more general but definitely less friendly than having + ``to_arrow`` and ``to_numpy`` methods. So for the buffers which lack + ``__dlpack__`` (e.g., because the column dtype isn't supported by DLPack), + this is worth looking at again. + +""" + + +# A typing protocol could be added later to let Mypy validate code using +# `from_dataframe` better. +DataFrameObject = Any +ColumnObject = Any + + +def from_dataframe(df : DataFrameObject, allow_copy: bool = False) : + """ + Construct a cudf DataFrame from ``df`` if it supports ``__dataframe__`` + """ + if isinstance(df, cudf.DataFrame): + return df + + if not hasattr(df, '__dataframe__'): + raise ValueError("`df` does not support __dataframe__") + + return _from_dataframe(df.__dataframe__(allow_copy=allow_copy)) + + +def _from_dataframe(df : DataFrameObject) : + """ + Create a cudf DataFrame object from DataFrameObject. + """ + # Check number of chunks, if there's more than one we need to iterate + if df.num_chunks() > 1: + raise NotImplementedError + + # We need a dict of columns here, with each column being a cudf column column. + columns = dict() + _k = _DtypeKind + _buffers = [] # hold on to buffers, keeps memory alive + for name in df.column_names(): + col = df.get_column_by_name(name) + + if col.dtype[0] in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL): + columns[name], _buf = _protocol_column_to_cudf_column_numeric(col) + + elif col.dtype[0] == _k.CATEGORICAL: + columns[name], _buf = _protocol_column_to_cudf_column_categorical(col) + + elif col.dtype[0] == _k.STRING: + columns[name], _buf = _protocol_column_to_cudf_column_string(col) + + else: + raise NotImplementedError(f"Data type {col.dtype[0]} not handled yet") + + _buffers.append(_buf) + + df_new = cudf.DataFrame._from_data(columns) + df_new._buffers = _buffers + return df_new + + +def _protocol_column_to_cudf_column_numeric(col:ColumnObject): + """ + Convert an int, uint, float or bool protocol column to the corresponding cudf column + """ + if col.offset != 0: + raise NotImplementedError("column.offset > 0 not handled yet") + + _dbuffer, _ddtype = col.get_buffers()['data'] + _check_data_is_on_gpu(_dbuffer) + dcol = build_column(Buffer(_dbuffer.ptr, _dbuffer.bufsize), + protocol_dtypes_to_cupy_dtype(_ddtype)) + return _set_missing_values(col, dcol), _dbuffer + + +def _check_data_is_on_gpu(buffer): + if buffer.__dlpack_device__()[0] != 2 and not buffer._allow_copy: + raise TypeError("This operation must copy data from CPU to GPU." + "Set `allow_copy=True` to allow it.") + +def _set_missing_values(protocol_col, cudf_col): + null_kind, null_value = protocol_col.describe_null + if null_kind != 0: + assert null_kind == 3, f"cudf supports only bit mask, null_kind should be 3 ." + _mask_buffer, _mask_dtype = protocol_col.get_buffers()["validity"] + bitmask = cp.asarray(Buffer(_mask_buffer.ptr, _mask_buffer.bufsize), cp.bool8) + cudf_col[~bitmask] = None + + return cudf_col + +def protocol_dtypes_to_cupy_dtype(_dtype): + kind = _dtype[0] + bitwidth = _dtype[1] + _k = _DtypeKind + if _dtype[0] not in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL,_k.CATEGORICAL, + _k.STRING, _k.DATETIME): + raise RuntimeError(f"Data type {_dtype[0]} not handled yet") + + _ints = {8: cp.int8, 16: cp.int16, 32: cp.int32, 64: cp.int64} + _uints = {8: cp.uint8, 16: cp.uint16, 32: cp.uint32, 64: cp.uint64} + _floats = {32: cp.float32, 64: cp.float64} + _cp_dtypes = {0: _ints, 1: _uints, 2: _floats, 20: {8: bool}} + return _cp_dtypes[kind][bitwidth] + +def _protocol_column_to_cudf_column_categorical(col : ColumnObject) : + """ + Convert a categorical column to a Series instance + """ + ordered, is_dict, mapping = col.describe_categorical + if not is_dict: + raise NotImplementedError('Non-dictionary categoricals not supported yet') + + categories = as_column(mapping.values()) + codes_buffer, codes_dtype = col.get_buffers()['data'] + _check_data_is_on_gpu(codes_buffer) + cdtype = protocol_dtypes_to_cupy_dtype(codes_dtype) + codes = build_column(Buffer(codes_buffer.ptr, codes_buffer.bufsize), cdtype) + + col1 = build_categorical_column(categories=categories,codes=codes,mask=codes.base_mask, + size=codes.size,ordered=ordered) + + return _set_missing_values(col, col1), codes_buffer + + +def _protocol_column_to_cudf_column_string(col : ColumnObject) : + """ + Convert a string ColumnObject to cudf Column object. + """ + # Retrieve the data buffers + buffers = col.get_buffers() + + # Retrieve the data buffer containing the UTF-8 code units + dbuffer, bdtype = buffers["data"] + _check_data_is_on_gpu(dbuffer) + encoded_string = build_column(Buffer(dbuffer.ptr, dbuffer.bufsize), + protocol_dtypes_to_cupy_dtype(bdtype) + ) + + # Retrieve the offsets buffer containing the index offsets demarcating the beginning and end of each string + obuffer, odtype = buffers["offsets"] + offsets = build_column(Buffer(obuffer.ptr, obuffer.bufsize), + protocol_dtypes_to_cupy_dtype(odtype) + ) + + col_str = build_column(None, dtype=cp.dtype('O'), children=(offsets, encoded_string)) + + return _set_missing_values(col, col_str), buffers + + + +def __dataframe__(self, nan_as_null : bool = False, + allow_copy : bool = True) -> dict: + """ + The public method to attach to cudf.DataFrame. + + We'll attach it via monkey-patching here for demo purposes. If Pandas adopts + the protocol, this will be a regular method on pandas.DataFrame. + + ``nan_as_null`` is a keyword intended for the consumer to tell the + producer to overwrite null values in the data with ``NaN`` (or ``NaT``). + This currently has no effect; once support for nullable extension + dtypes is added, this value should be propagated to columns. + + ``allow_copy`` is a keyword that defines whether or not the library is + allowed to make a copy of the data. For example, copying data would be + necessary if a library supports strided buffers, given that this protocol + specifies contiguous buffers. + Currently, if the flag is set to ``False`` and a copy is needed, a + ``RuntimeError`` will be raised. + """ + return _CuDFDataFrame( + self, nan_as_null=nan_as_null, allow_copy=allow_copy) \ No newline at end of file From c0f2bc347f7bcbe86395656dac76fbe3a40ac96a Mon Sep 17 00:00:00 2001 From: iskode Date: Fri, 22 Oct 2021 09:29:20 +0000 Subject: [PATCH 27/60] replace '_ints' and similar to module level constants --- python/cudf/cudf/core/df_protocol.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index 9a4f052b598..feceae5acaf 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -513,6 +513,12 @@ def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable['_CuDFDataFram ColumnObject = Any +_INTS = {8: cp.int8, 16: cp.int16, 32: cp.int32, 64: cp.int64} +_UINTS = {8: cp.uint8, 16: cp.uint16, 32: cp.uint32, 64: cp.uint64} +_FLOATS = {32: cp.float32, 64: cp.float64} +_CP_DTYPES = {0: _INTS, 1: _UINTS, 2: _FLOATS, 20: {8: bool}} + + def from_dataframe(df : DataFrameObject, allow_copy: bool = False) : """ Construct a cudf DataFrame from ``df`` if it supports ``__dataframe__`` @@ -596,12 +602,8 @@ def protocol_dtypes_to_cupy_dtype(_dtype): if _dtype[0] not in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL,_k.CATEGORICAL, _k.STRING, _k.DATETIME): raise RuntimeError(f"Data type {_dtype[0]} not handled yet") - - _ints = {8: cp.int8, 16: cp.int16, 32: cp.int32, 64: cp.int64} - _uints = {8: cp.uint8, 16: cp.uint16, 32: cp.uint32, 64: cp.uint64} - _floats = {32: cp.float32, 64: cp.float64} - _cp_dtypes = {0: _ints, 1: _uints, 2: _floats, 20: {8: bool}} - return _cp_dtypes[kind][bitwidth] + + return _CP_DTYPES[kind][bitwidth] def _protocol_column_to_cudf_column_categorical(col : ColumnObject) : """ From 3a3a9dc0ec394ee0dcfe4cf9f7ee47d6aab9e5a6 Mon Sep 17 00:00:00 2001 From: iskode Date: Fri, 22 Oct 2021 15:07:54 +0000 Subject: [PATCH 28/60] define module level Device class and remove device check with number (2) --- python/cudf/cudf/core/df_protocol.py | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index feceae5acaf..1564d380108 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -12,6 +12,8 @@ + + # Implementation of interchange protocol classes # ---------------------------------------------- @@ -23,7 +25,17 @@ class _DtypeKind(enum.IntEnum): STRING = 21 # UTF-8 DATETIME = 22 CATEGORICAL = 23 - + +class Device(enum.IntEnum): + CPU = 1 + CUDA = 2 + CPU_PINNED = 3 + OPENCL = 4 + VULKAN = 7 + METAL = 8 + VPI = 9 + ROCM = 10 + class _CuDFBuffer: """ @@ -61,6 +73,7 @@ def __dlpack__(self): try: cudarray = cuda.as_cuda_array(self._buf).view(self._cudf_dtype) res = cp.asarray(cudarray).toDlpack() + except ValueError: raise TypeError(f'dtype {self._cudf_dtype} unsupported by `dlpack`') @@ -70,9 +83,6 @@ def __dlpack_device__(self) -> Tuple[enum.IntEnum, int]: """ Device type and device ID for where the data in the buffer resides. """ - class Device(enum.IntEnum): - CUDA = 2 - return (Device.CUDA, cp.asarray(self._buf).device.id) def __repr__(self) -> str: @@ -485,7 +495,6 @@ def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable['_CuDFDataFram """ return (self,) - """ Implementation of the dataframe exchange protocol. @@ -519,7 +528,7 @@ def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable['_CuDFDataFram _CP_DTYPES = {0: _INTS, 1: _UINTS, 2: _FLOATS, 20: {8: bool}} -def from_dataframe(df : DataFrameObject, allow_copy: bool = False) : +def from_dataframe(df : DataFrameObject, allow_copy: bool = False) -> _CuDFDataFrame : """ Construct a cudf DataFrame from ``df`` if it supports ``__dataframe__`` """ @@ -538,7 +547,7 @@ def _from_dataframe(df : DataFrameObject) : """ # Check number of chunks, if there's more than one we need to iterate if df.num_chunks() > 1: - raise NotImplementedError + raise NotImplementedError("More than one chunk not handled yet") # We need a dict of columns here, with each column being a cudf column column. columns = dict() @@ -581,7 +590,7 @@ def _protocol_column_to_cudf_column_numeric(col:ColumnObject): def _check_data_is_on_gpu(buffer): - if buffer.__dlpack_device__()[0] != 2 and not buffer._allow_copy: + if buffer.__dlpack_device__()[0] != Device.CUDA and not buffer._allow_copy: raise TypeError("This operation must copy data from CPU to GPU." "Set `allow_copy=True` to allow it.") @@ -672,4 +681,5 @@ def __dataframe__(self, nan_as_null : bool = False, ``RuntimeError`` will be raised. """ return _CuDFDataFrame( - self, nan_as_null=nan_as_null, allow_copy=allow_copy) \ No newline at end of file + self, nan_as_null=nan_as_null, allow_copy=allow_copy) + From 7ac4f27d1b30d51fd2856dded05aec24bb6ae0fe Mon Sep 17 00:00:00 2001 From: iskode Date: Fri, 22 Oct 2021 15:30:47 +0000 Subject: [PATCH 29/60] rename methods + annotate return types --- python/cudf/cudf/core/df_protocol.py | 33 ++++++++++++---------- python/cudf/cudf/tests/test_df_protocol.py | 6 ++-- 2 files changed, 20 insertions(+), 19 deletions(-) diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index 1564d380108..7524d5af317 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -541,7 +541,7 @@ def from_dataframe(df : DataFrameObject, allow_copy: bool = False) -> _CuDFDataF return _from_dataframe(df.__dataframe__(allow_copy=allow_copy)) -def _from_dataframe(df : DataFrameObject) : +def _from_dataframe(df : DataFrameObject) -> _CuDFDataFrame : """ Create a cudf DataFrame object from DataFrameObject. """ @@ -575,7 +575,8 @@ def _from_dataframe(df : DataFrameObject) : return df_new -def _protocol_column_to_cudf_column_numeric(col:ColumnObject): +def _protocol_column_to_cudf_column_numeric(col:ColumnObject) -> \ + Tuple[cudf.core.column.NumericalColumn, _CuDFBuffer]: """ Convert an int, uint, float or bool protocol column to the corresponding cudf column """ @@ -585,26 +586,26 @@ def _protocol_column_to_cudf_column_numeric(col:ColumnObject): _dbuffer, _ddtype = col.get_buffers()['data'] _check_data_is_on_gpu(_dbuffer) dcol = build_column(Buffer(_dbuffer.ptr, _dbuffer.bufsize), - protocol_dtypes_to_cupy_dtype(_ddtype)) + protocol_dtype_to_cupy_dtype(_ddtype)) return _set_missing_values(col, dcol), _dbuffer -def _check_data_is_on_gpu(buffer): +def _check_data_is_on_gpu(buffer) -> None: if buffer.__dlpack_device__()[0] != Device.CUDA and not buffer._allow_copy: raise TypeError("This operation must copy data from CPU to GPU." "Set `allow_copy=True` to allow it.") -def _set_missing_values(protocol_col, cudf_col): +def _set_missing_values(protocol_col, cudf_col) -> cudf.core.column.ColumnBase: null_kind, null_value = protocol_col.describe_null if null_kind != 0: - assert null_kind == 3, f"cudf supports only bit mask, null_kind should be 3 ." + assert null_kind == 3, f"cudf supports only bit mask, null_kind should be 3, got: {null_kind}." _mask_buffer, _mask_dtype = protocol_col.get_buffers()["validity"] bitmask = cp.asarray(Buffer(_mask_buffer.ptr, _mask_buffer.bufsize), cp.bool8) cudf_col[~bitmask] = None return cudf_col -def protocol_dtypes_to_cupy_dtype(_dtype): +def protocol_dtype_to_cupy_dtype(_dtype) -> cp.dtype: kind = _dtype[0] bitwidth = _dtype[1] _k = _DtypeKind @@ -614,7 +615,8 @@ def protocol_dtypes_to_cupy_dtype(_dtype): return _CP_DTYPES[kind][bitwidth] -def _protocol_column_to_cudf_column_categorical(col : ColumnObject) : +def _protocol_column_to_cudf_column_categorical(col : ColumnObject) -> \ + Tuple[cudf.core.column.CategoricalColumn, _CuDFBuffer] : """ Convert a categorical column to a Series instance """ @@ -625,16 +627,17 @@ def _protocol_column_to_cudf_column_categorical(col : ColumnObject) : categories = as_column(mapping.values()) codes_buffer, codes_dtype = col.get_buffers()['data'] _check_data_is_on_gpu(codes_buffer) - cdtype = protocol_dtypes_to_cupy_dtype(codes_dtype) + cdtype = protocol_dtype_to_cupy_dtype(codes_dtype) codes = build_column(Buffer(codes_buffer.ptr, codes_buffer.bufsize), cdtype) - col1 = build_categorical_column(categories=categories,codes=codes,mask=codes.base_mask, + cudfcol = build_categorical_column(categories=categories,codes=codes,mask=codes.base_mask, size=codes.size,ordered=ordered) - return _set_missing_values(col, col1), codes_buffer + return _set_missing_values(col, cudfcol), codes_buffer -def _protocol_column_to_cudf_column_string(col : ColumnObject) : +def _protocol_column_to_cudf_column_string(col : ColumnObject) -> \ + Tuple[cudf.core.column.StringColumn, Tuple[_CuDFBuffer]] : """ Convert a string ColumnObject to cudf Column object. """ @@ -645,13 +648,13 @@ def _protocol_column_to_cudf_column_string(col : ColumnObject) : dbuffer, bdtype = buffers["data"] _check_data_is_on_gpu(dbuffer) encoded_string = build_column(Buffer(dbuffer.ptr, dbuffer.bufsize), - protocol_dtypes_to_cupy_dtype(bdtype) + protocol_dtype_to_cupy_dtype(bdtype) ) # Retrieve the offsets buffer containing the index offsets demarcating the beginning and end of each string obuffer, odtype = buffers["offsets"] offsets = build_column(Buffer(obuffer.ptr, obuffer.bufsize), - protocol_dtypes_to_cupy_dtype(odtype) + protocol_dtype_to_cupy_dtype(odtype) ) col_str = build_column(None, dtype=cp.dtype('O'), children=(offsets, encoded_string)) @@ -661,7 +664,7 @@ def _protocol_column_to_cudf_column_string(col : ColumnObject) : def __dataframe__(self, nan_as_null : bool = False, - allow_copy : bool = True) -> dict: + allow_copy : bool = True) -> _CuDFDataFrame: """ The public method to attach to cudf.DataFrame. diff --git a/python/cudf/cudf/tests/test_df_protocol.py b/python/cudf/cudf/tests/test_df_protocol.py index 2fe1652750d..3b697d2d602 100644 --- a/python/cudf/cudf/tests/test_df_protocol.py +++ b/python/cudf/cudf/tests/test_df_protocol.py @@ -5,9 +5,7 @@ from cudf.core.df_protocol import ( _from_dataframe, _DtypeKind, - protocol_dtypes_to_cupy_dtype, - - + protocol_dtype_to_cupy_dtype, _CuDFDataFrame, _CuDFColumn, _CuDFBuffer @@ -36,7 +34,7 @@ def assert_buffer_equal(buffer_dtype: Tuple[_CuDFBuffer, Any], cudfcol): device_id = cp.asarray(cudfcol.data).device.id assert buf.__dlpack_device__() == (2, device_id) col_from_buf = build_column(Buffer(buf.ptr, buf.bufsize), - protocol_dtypes_to_cupy_dtype(dtype) + protocol_dtype_to_cupy_dtype(dtype) ) # check that non null values are the equals as null are represented # by sentinel values in the buffer. From c572c646503c437faa31cbcf2ef4277b63b3ce98 Mon Sep 17 00:00:00 2001 From: iskode Date: Sat, 23 Oct 2021 11:42:43 +0000 Subject: [PATCH 30/60] add type annotations --- python/cudf/cudf/core/df_protocol.py | 74 +++++++++++++++------------- 1 file changed, 39 insertions(+), 35 deletions(-) diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index 7524d5af317..fe883c86be4 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -42,9 +42,10 @@ class _CuDFBuffer: Data in the buffer is guaranteed to be contiguous in memory. """ - def __init__(self, buf : Buffer, cudf_dtype, allow_copy : bool = True) -> None: + def __init__(self, buf : cudf.core.buffer.Buffer, + cudf_dtype: cp.dtype, allow_copy : bool = True) -> None: """ - Use cudf Buffer object. + Use cudf.core.buffer.Buffer object. """ # Store the cudf buffer where the data resides as a private # attribute, so we can use it to retrieve the public attributes @@ -66,7 +67,7 @@ def ptr(self) -> int: """ return self._buf.ptr - def __dlpack__(self): + def __dlpack__(self) : """ DLPack not implemented in NumPy yet, so leave it out here. """ @@ -107,7 +108,7 @@ class _CuDFColumn: """ - def __init__(self, column, + def __init__(self, column: cudf.core.column.ColumnBase, nan_as_null : bool = True, allow_copy: bool = True) -> None: """ @@ -217,7 +218,7 @@ def _dtype_from_cudfdtype(self, dtype) -> Tuple[enum.IntEnum, int, str, str]: return (kind, bitwidth, format_str, endianness) @property - def describe_categorical(self) -> Tuple[Any, bool, Dict[int, Any]]: + def describe_categorical(self) -> Tuple[bool, bool, Dict[int, Any]]: """ If the dtype is categorical, there are two options: @@ -313,7 +314,7 @@ def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable['_CuDFColumn'] """ return (self,) - def get_buffers(self) -> Dict[str, Any]: + def get_buffers(self) -> Dict[str, _CuDFBuffer]: """ Return a dictionary containing the underlying buffers. @@ -432,7 +433,8 @@ class _CuDFDataFrame: ``cudf.DataFrame.__dataframe__`` as objects with the methods and attributes defined on this class. """ - def __init__(self, df, nan_as_null : bool = True, + def __init__(self, df : 'cudf.core.dataframe.DataFrame', + nan_as_null : bool = True, allow_copy : bool = True) -> None: """ Constructor - an instance of this (private) class is returned from @@ -495,6 +497,31 @@ def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable['_CuDFDataFram """ return (self,) + +def __dataframe__(self, nan_as_null : bool = False, + allow_copy : bool = True) -> _CuDFDataFrame: + """ + The public method to attach to cudf.DataFrame. + + We'll attach it via monkey-patching here for demo purposes. If Pandas adopts + the protocol, this will be a regular method on pandas.DataFrame. + + ``nan_as_null`` is a keyword intended for the consumer to tell the + producer to overwrite null values in the data with ``NaN`` (or ``NaT``). + This currently has no effect; once support for nullable extension + dtypes is added, this value should be propagated to columns. + + ``allow_copy`` is a keyword that defines whether or not the library is + allowed to make a copy of the data. For example, copying data would be + necessary if a library supports strided buffers, given that this protocol + specifies contiguous buffers. + Currently, if the flag is set to ``False`` and a copy is needed, a + ``RuntimeError`` will be raised. + """ + return _CuDFDataFrame( + self, nan_as_null=nan_as_null, allow_copy=allow_copy) + + """ Implementation of the dataframe exchange protocol. @@ -590,12 +617,15 @@ def _protocol_column_to_cudf_column_numeric(col:ColumnObject) -> \ return _set_missing_values(col, dcol), _dbuffer -def _check_data_is_on_gpu(buffer) -> None: +def _check_data_is_on_gpu(buffer : _CuDFBuffer) -> None: if buffer.__dlpack_device__()[0] != Device.CUDA and not buffer._allow_copy: raise TypeError("This operation must copy data from CPU to GPU." "Set `allow_copy=True` to allow it.") -def _set_missing_values(protocol_col, cudf_col) -> cudf.core.column.ColumnBase: +def _set_missing_values(protocol_col: _CuDFColumn, + cudf_col:'cudf.core.dataframe.DataFrame') \ + -> cudf.core.column.ColumnBase: + null_kind, null_value = protocol_col.describe_null if null_kind != 0: assert null_kind == 3, f"cudf supports only bit mask, null_kind should be 3, got: {null_kind}." @@ -660,29 +690,3 @@ def _protocol_column_to_cudf_column_string(col : ColumnObject) -> \ col_str = build_column(None, dtype=cp.dtype('O'), children=(offsets, encoded_string)) return _set_missing_values(col, col_str), buffers - - - -def __dataframe__(self, nan_as_null : bool = False, - allow_copy : bool = True) -> _CuDFDataFrame: - """ - The public method to attach to cudf.DataFrame. - - We'll attach it via monkey-patching here for demo purposes. If Pandas adopts - the protocol, this will be a regular method on pandas.DataFrame. - - ``nan_as_null`` is a keyword intended for the consumer to tell the - producer to overwrite null values in the data with ``NaN`` (or ``NaT``). - This currently has no effect; once support for nullable extension - dtypes is added, this value should be propagated to columns. - - ``allow_copy`` is a keyword that defines whether or not the library is - allowed to make a copy of the data. For example, copying data would be - necessary if a library supports strided buffers, given that this protocol - specifies contiguous buffers. - Currently, if the flag is set to ``False`` and a copy is needed, a - ``RuntimeError`` will be raised. - """ - return _CuDFDataFrame( - self, nan_as_null=nan_as_null, allow_copy=allow_copy) - From c60cf5b8e42bec051c616855dfeb6c96960b2708 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Isma=C3=ABl=20Kon=C3=A9?= Date: Fri, 29 Oct 2021 20:45:16 +0000 Subject: [PATCH 31/60] correct '__dlpack_device__' annotation Co-authored-by: Bradley Dice --- python/cudf/cudf/core/df_protocol.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index fe883c86be4..b4c103eb260 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -80,7 +80,7 @@ def __dlpack__(self) : return res - def __dlpack_device__(self) -> Tuple[enum.IntEnum, int]: + def __dlpack_device__(self) -> Tuple[_Device, int]: """ Device type and device ID for where the data in the buffer resides. """ From 4262fc74386aedd5e1201396d3956a02d3d99458 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Isma=C3=ABl=20Kon=C3=A9?= Date: Fri, 29 Oct 2021 20:46:11 +0000 Subject: [PATCH 32/60] correct 'dtype' method annotation Co-authored-by: Bradley Dice --- python/cudf/cudf/core/df_protocol.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index b4c103eb260..46f49e0b6e7 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -139,7 +139,7 @@ def offset(self) -> int: return 0 @property - def dtype(self) -> Tuple[enum.IntEnum, int, str, str]: + def dtype(self) -> Tuple[_DtypeKind, int, str, str]: """ Dtype description as a tuple ``(kind, bit-width, format string, endianness)`` From a7fe2876f046b2accf44b81411974c8393c1ecef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Isma=C3=ABl=20Kon=C3=A9?= Date: Mon, 1 Nov 2021 13:31:18 +0100 Subject: [PATCH 33/60] mark 'Device' class as private Co-authored-by: Bradley Dice --- python/cudf/cudf/core/df_protocol.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index 46f49e0b6e7..1354c224176 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -26,7 +26,7 @@ class _DtypeKind(enum.IntEnum): DATETIME = 22 CATEGORICAL = 23 -class Device(enum.IntEnum): +class _Device(enum.IntEnum): CPU = 1 CUDA = 2 CPU_PINNED = 3 From f5aef739370b69ebf4b30bdca6ce0eb998d88598 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Isma=C3=ABl=20Kon=C3=A9?= Date: Mon, 1 Nov 2021 13:49:58 +0100 Subject: [PATCH 34/60] Apply suggestions from @bdice code review Co-authored-by: Bradley Dice --- python/cudf/cudf/core/df_protocol.py | 31 ++++++++++------------ python/cudf/cudf/tests/test_df_protocol.py | 4 +-- 2 files changed, 16 insertions(+), 19 deletions(-) diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index 1354c224176..e6584e47326 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -43,7 +43,7 @@ class _CuDFBuffer: """ def __init__(self, buf : cudf.core.buffer.Buffer, - cudf_dtype: cp.dtype, allow_copy : bool = True) -> None: + dtype: np.dtype, allow_copy : bool = True) -> None: """ Use cudf.core.buffer.Buffer object. """ @@ -87,7 +87,7 @@ def __dlpack_device__(self) -> Tuple[_Device, int]: return (Device.CUDA, cp.asarray(self._buf).device.id) def __repr__(self) -> str: - return 'CuDFBuffer(' + str({'bufsize': self.bufsize, + return f'{self.__class__.__name__}(' + str({'bufsize': self.bufsize, 'ptr': self.ptr, 'dlpack': self.__dlpack__(), 'device': self.__dlpack_device__()[0].name} @@ -186,7 +186,7 @@ def dtype(self) -> Tuple[_DtypeKind, int, str, str]: return self._dtype_from_cudfdtype(dtype) - def _dtype_from_cudfdtype(self, dtype) -> Tuple[enum.IntEnum, int, str, str]: + def _dtype_from_cudfdtype(self, dtype) -> Tuple[_DtypeKind, int, str, str]: """ See `self.dtype` for details. """ @@ -214,7 +214,7 @@ def _dtype_from_cudfdtype(self, dtype) -> Tuple[enum.IntEnum, int, str, str]: bitwidth = dtype.itemsize * 8 format_str = dtype.str - endianness = dtype.byteorder if not kind == _k.CATEGORICAL else '=' + endianness = dtype.byteorder if kind != _k.CATEGORICAL else '=' return (kind, bitwidth, format_str, endianness) @property @@ -225,7 +225,7 @@ def describe_categorical(self) -> Tuple[bool, bool, Dict[int, Any]]: - There are only values in the data buffer. - There is a separate dictionary-style encoding for categorical values. - Raises RuntimeError if the dtype is not categorical + Raises TypeError if the dtype is not categorical Content of returned dict: @@ -269,18 +269,15 @@ def describe_null(self) -> Tuple[int, Any]: otherwise. """ if self.null_count == 0: - # there is no validity mask in this case - # so making it non-nullable (hackingly) - null = 0 - value = None + # there is no validity mask so it is non-nullable + return 0, None else : _k = _DtypeKind kind = self.dtype[0] # bit mask is universally used in cudf for missing if kind in (_k.INT, _k.UINT, _k.FLOAT, _k.CATEGORICAL, _k.BOOL, _k.STRING, _k.DATETIME): - null = 3 - value = 0 + return 3, 0 else: raise NotImplementedError(f"Data type {self.dtype} not yet supported") @@ -291,7 +288,7 @@ def null_count(self) -> int: """ Number of null elements. Should always be known. """ - return self._col.isna().sum() + return self._col.null_count @property def metadata(self) -> Dict[str, Any]: @@ -308,7 +305,7 @@ def num_chunks(self) -> int: def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable['_CuDFColumn']: """ - Return an iterator yielding the chunks. + Return an iterable yielding the chunks. See `DataFrame.get_chunks` for details on ``n_chunks``. """ @@ -479,7 +476,7 @@ def get_columns(self) -> Iterable[_CuDFColumn]: for name in self._df.columns] def select_columns(self, indices: Sequence[int]) -> '_CuDFDataFrame': - if not isinstance(indices, collections.Sequence): + if not isinstance(indices, collections.abc.Sequence): raise ValueError("`indices` is not a sequence") return _CuDFDataFrame(self._df.iloc[:, indices]) @@ -576,7 +573,7 @@ def _from_dataframe(df : DataFrameObject) -> _CuDFDataFrame : if df.num_chunks() > 1: raise NotImplementedError("More than one chunk not handled yet") - # We need a dict of columns here, with each column being a cudf column column. + # We need a dict of columns here, with each column being a cudf column. columns = dict() _k = _DtypeKind _buffers = [] # hold on to buffers, keeps memory alive @@ -619,7 +616,7 @@ def _protocol_column_to_cudf_column_numeric(col:ColumnObject) -> \ def _check_data_is_on_gpu(buffer : _CuDFBuffer) -> None: if buffer.__dlpack_device__()[0] != Device.CUDA and not buffer._allow_copy: - raise TypeError("This operation must copy data from CPU to GPU." + raise TypeError("This operation must copy data from CPU to GPU. " "Set `allow_copy=True` to allow it.") def _set_missing_values(protocol_col: _CuDFColumn, @@ -627,7 +624,7 @@ def _set_missing_values(protocol_col: _CuDFColumn, -> cudf.core.column.ColumnBase: null_kind, null_value = protocol_col.describe_null - if null_kind != 0: + if null_kind != 0: assert null_kind == 3, f"cudf supports only bit mask, null_kind should be 3, got: {null_kind}." _mask_buffer, _mask_dtype = protocol_col.get_buffers()["validity"] bitmask = cp.asarray(Buffer(_mask_buffer.ptr, _mask_buffer.bufsize), cp.bool8) diff --git a/python/cudf/cudf/tests/test_df_protocol.py b/python/cudf/cudf/tests/test_df_protocol.py index 3b697d2d602..923756e9d06 100644 --- a/python/cudf/cudf/tests/test_df_protocol.py +++ b/python/cudf/cudf/tests/test_df_protocol.py @@ -38,7 +38,7 @@ def assert_buffer_equal(buffer_dtype: Tuple[_CuDFBuffer, Any], cudfcol): ) # check that non null values are the equals as null are represented # by sentinel values in the buffer. - non_null_idxs = cudfcol!=None + non_null_idxs = cudfcol is not None assert_eq(col_from_buf[non_null_idxs], cudfcol[non_null_idxs]) if dtype[0] != _DtypeKind.BOOL: @@ -184,7 +184,7 @@ def test_NA_categorical_dtype(): col = df.__dataframe__().get_column_by_name('B') assert col.dtype[0] == _DtypeKind.CATEGORICAL assert col.null_count == 2 - assert col.describe_null == (3, 0) # sentinel value -1 + assert col.describe_null == (3, 0) assert col.num_chunks() == 1 assert col.describe_categorical == (False, True, {0: 1, 1: 2, 2: 5}) _test_from_dataframe_equals(df.__dataframe__(allow_copy=False)) From c47ce43390e2820a4f63cc8d8ed31db4e908e80e Mon Sep 17 00:00:00 2001 From: iskode Date: Mon, 1 Nov 2021 13:36:35 +0000 Subject: [PATCH 35/60] fix test errors due to changes + remove commented code --- python/cudf/cudf/core/df_protocol.py | 33 +++++++++++----------- python/cudf/cudf/tests/test_df_protocol.py | 18 ++++-------- 2 files changed, 23 insertions(+), 28 deletions(-) diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index e6584e47326..b9c3efac707 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -36,6 +36,9 @@ class _Device(enum.IntEnum): VPI = 9 ROCM = 10 +_k = _DtypeKind +SUPPORTED_DTYPE = (_k.INT, _k.UINT, _k.FLOAT, _k.CATEGORICAL, + _k.BOOL, _k.STRING) class _CuDFBuffer: """ @@ -50,7 +53,7 @@ def __init__(self, buf : cudf.core.buffer.Buffer, # Store the cudf buffer where the data resides as a private # attribute, so we can use it to retrieve the public attributes self._buf = buf - self._cudf_dtype = cudf_dtype + self._dtype = dtype self._allow_copy = allow_copy @property @@ -72,19 +75,19 @@ def __dlpack__(self) : DLPack not implemented in NumPy yet, so leave it out here. """ try: - cudarray = cuda.as_cuda_array(self._buf).view(self._cudf_dtype) + cudarray = cuda.as_cuda_array(self._buf).view(self._dtype) res = cp.asarray(cudarray).toDlpack() except ValueError: - raise TypeError(f'dtype {self._cudf_dtype} unsupported by `dlpack`') + raise TypeError(f'dtype {self._dtype} unsupported by `dlpack`') return res def __dlpack_device__(self) -> Tuple[_Device, int]: """ - Device type and device ID for where the data in the buffer resides. + _Device type and _Device ID for where the data in the buffer resides. """ - return (Device.CUDA, cp.asarray(self._buf).device.id) + return (_Device.CUDA, cp.asarray(self._buf).device.id) def __repr__(self) -> str: return f'{self.__class__.__name__}(' + str({'bufsize': self.bufsize, @@ -268,20 +271,18 @@ def describe_null(self) -> Tuple[int, Any]: mask or a byte mask, the value (0 or 1) indicating a missing value. None otherwise. """ + _k = _DtypeKind + kind = self.dtype[0] if self.null_count == 0: # there is no validity mask so it is non-nullable return 0, None - else : - _k = _DtypeKind - kind = self.dtype[0] - # bit mask is universally used in cudf for missing - if kind in (_k.INT, _k.UINT, _k.FLOAT, _k.CATEGORICAL, - _k.BOOL, _k.STRING, _k.DATETIME): - return 3, 0 - else: - raise NotImplementedError(f"Data type {self.dtype} not yet supported") - return null, value + elif kind in SUPPORTED_DTYPE: + # bit mask is universally used in cudf for missing + return 3, 0 + + else: + raise NotImplementedError(f"Data type {self.dtype} not yet supported") @property def null_count(self) -> int: @@ -615,7 +616,7 @@ def _protocol_column_to_cudf_column_numeric(col:ColumnObject) -> \ def _check_data_is_on_gpu(buffer : _CuDFBuffer) -> None: - if buffer.__dlpack_device__()[0] != Device.CUDA and not buffer._allow_copy: + if buffer.__dlpack_device__()[0] != _Device.CUDA and not buffer._allow_copy: raise TypeError("This operation must copy data from CPU to GPU. " "Set `allow_copy=True` to allow it.") diff --git a/python/cudf/cudf/tests/test_df_protocol.py b/python/cudf/cudf/tests/test_df_protocol.py index 923756e9d06..dda608cf064 100644 --- a/python/cudf/cudf/tests/test_df_protocol.py +++ b/python/cudf/cudf/tests/test_df_protocol.py @@ -29,8 +29,8 @@ DataFrameObject = Any -def assert_buffer_equal(buffer_dtype: Tuple[_CuDFBuffer, Any], cudfcol): - buf, dtype = buffer_dtype +def assert_buffer_equal(buffer_and_dtype: Tuple[_CuDFBuffer, Any], cudfcol): + buf, dtype = buffer_and_dtype device_id = cp.asarray(cudfcol.data).device.id assert buf.__dlpack_device__() == (2, device_id) col_from_buf = build_column(Buffer(buf.ptr, buf.bufsize), @@ -38,13 +38,14 @@ def assert_buffer_equal(buffer_dtype: Tuple[_CuDFBuffer, Any], cudfcol): ) # check that non null values are the equals as null are represented # by sentinel values in the buffer. - non_null_idxs = cudfcol is not None + non_null_idxs = cudfcol != None + print(non_null_idxs, cudfcol is not None) assert_eq(col_from_buf[non_null_idxs], cudfcol[non_null_idxs]) if dtype[0] != _DtypeKind.BOOL: array_from_dlpack = cp.fromDlpack(buf.__dlpack__()) col_array = cp.asarray(cudfcol.data_array_view) - assert_eq(array_from_dlpack.all(), col_array.all()) + assert_eq(array_from_dlpack.flatten(), col_array.flatten()) else: pytest.raises(TypeError, buf.__dlpack__) @@ -54,7 +55,7 @@ def assert_buffer_equal(buffer_dtype: Tuple[_CuDFBuffer, Any], cudfcol): def assert_column_equal(col: _CuDFColumn, cudfcol): assert col.size == cudfcol.size assert col.offset == 0 - assert col.null_count == cudfcol.isna().sum() + assert col.null_count == cudfcol.null_count assert col.num_chunks() == 1 if col.null_count == 0 : pytest.raises(RuntimeError, col._get_validity_buffer) @@ -117,12 +118,6 @@ def _test_datatype(data): _test_from_dataframe_equals(cdf.__dataframe__(allow_copy=False)) _test_from_dataframe_equals(cdf.__dataframe__(allow_copy=True)) - # pdf = pd.DataFrame(data=data) - # cpu_dfobj = _CuDFDataFrame(pdf) - # _test_from_dataframe_exception(cpu_dfobj) - # _test_from_dataframe_equals(cpu_dfobj, allow_copy=True) - - def test_from_dataframe(): data = dict(a=[1, 2, 3], b=[9, 10, 11]) df1 = cudf.DataFrame(data=data) @@ -154,7 +149,6 @@ def test_bool_dtype(): def test_string_dtype(): data_string = dict(a=["a", "b", "cdef", "", "g"]) _test_datatype(data_string) - def test_mixed_dtype(): From 139ca5adfc9004ba12a2502f4fb0f019022401dc Mon Sep 17 00:00:00 2001 From: iskode Date: Mon, 1 Nov 2021 13:42:44 +0000 Subject: [PATCH 36/60] add string column to mixed type test + a mixed type test case with NA --- python/cudf/cudf/tests/test_df_protocol.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/tests/test_df_protocol.py b/python/cudf/cudf/tests/test_df_protocol.py index dda608cf064..ac99943b671 100644 --- a/python/cudf/cudf/tests/test_df_protocol.py +++ b/python/cudf/cudf/tests/test_df_protocol.py @@ -153,7 +153,8 @@ def test_string_dtype(): def test_mixed_dtype(): data_mixed = dict(int=[1, 2, 3], float=[1.5, 2.5, 3.5], - bool=[True, False, True], categorical=[5, 1, 5]) + bool=[True, False, True], categorical=[5, 1, 5], + string=["rapidsai-cudf ", "", "df protocol"]) _test_datatype(data_mixed) @@ -201,3 +202,12 @@ def test_NA_string_dtype(): assert col.num_chunks() == 1 _test_from_dataframe_equals(df.__dataframe__(allow_copy=False)) _test_from_dataframe_equals(df.__dataframe__(allow_copy=True)) + + +def test_NA_mixed_dtype(): + data_mixed = dict(int=[1, None, 2, 3, 1000], float=[None, 1.5, 2.5, 3.5, None], + bool=[True, None, False, None, None], + categorical=[5, 1, 5, 3, None], + string=[None, None, None, "df protocol", None]) + _test_datatype(data_mixed) + From bcec52c36654879088499aec42dfbad39186df68 Mon Sep 17 00:00:00 2001 From: iskode Date: Mon, 1 Nov 2021 14:55:58 +0000 Subject: [PATCH 37/60] address remaining suggestions from @bdice --- python/cudf/cudf/core/df_protocol.py | 45 ++++++++++------------ python/cudf/cudf/tests/test_df_protocol.py | 1 - 2 files changed, 20 insertions(+), 26 deletions(-) diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index b9c3efac707..d0b44ea597f 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -37,7 +37,7 @@ class _Device(enum.IntEnum): ROCM = 10 _k = _DtypeKind -SUPPORTED_DTYPE = (_k.INT, _k.UINT, _k.FLOAT, _k.CATEGORICAL, +_SUPPORTED_KINDS = (_k.INT, _k.UINT, _k.FLOAT, _k.CATEGORICAL, _k.BOOL, _k.STRING) class _CuDFBuffer: @@ -212,7 +212,7 @@ def _dtype_from_cudfdtype(self, dtype) -> Tuple[_DtypeKind, int, str, str]: raise ValueError(f"Data type {dtype} not supported by exchange" "protocol") - if kind not in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL, _k.CATEGORICAL, _k.STRING): + if kind not in _SUPPORTED_KINDS: raise NotImplementedError(f"Data type {dtype} not handled yet") bitwidth = dtype.itemsize * 8 @@ -277,7 +277,7 @@ def describe_null(self) -> Tuple[int, Any]: # there is no validity mask so it is non-nullable return 0, None - elif kind in SUPPORTED_DTYPE: + elif kind in _SUPPORTED_KINDS: # bit mask is universally used in cudf for missing return 3, 0 @@ -368,7 +368,6 @@ def _get_data_buffer(self) -> Tuple[_CuDFBuffer, Any]: # Any is for self.dtype encoded_string = self._col.children[1] buffer = _CuDFBuffer(encoded_string.data, encoded_string.dtype, allow_copy=self._allow_copy) dtype = self._dtype_from_cudfdtype(encoded_string.dtype) - # dtype = (_k.STRING, 8, "u", "=") else: raise NotImplementedError(f"Data type {self._col.dtype} not handled yet") @@ -396,13 +395,11 @@ def _get_validity_buffer(self) -> Tuple[_CuDFBuffer, Any]: return buffer, dtype elif null == 1: - msg = "This column uses NaN as null so does not have a separate mask" + raise RuntimeError("This column uses NaN as null so does not have a separate mask") elif null == 0: - msg = "This column is non-nullable so does not have a mask" + raise RuntimeError("This column is non-nullable so does not have a mask") else: - raise NotImplementedError("See self.describe_null") - - raise RuntimeError(msg) + raise NotImplementedError(f"See {self.__class__.__name__}.describe_null method.") def _get_offsets_buffer(self) -> Tuple[_CuDFBuffer, Any]: """ @@ -422,6 +419,7 @@ def _get_offsets_buffer(self) -> Tuple[_CuDFBuffer, Any]: return buffer, dtype + class _CuDFDataFrame: """ A data frame class, with only the methods required by the interchange @@ -501,9 +499,6 @@ def __dataframe__(self, nan_as_null : bool = False, """ The public method to attach to cudf.DataFrame. - We'll attach it via monkey-patching here for demo purposes. If Pandas adopts - the protocol, this will be a regular method on pandas.DataFrame. - ``nan_as_null`` is a keyword intended for the consumer to tell the producer to overwrite null values in the data with ``NaN`` (or ``NaT``). This currently has no effect; once support for nullable extension @@ -513,7 +508,7 @@ def __dataframe__(self, nan_as_null : bool = False, allowed to make a copy of the data. For example, copying data would be necessary if a library supports strided buffers, given that this protocol specifies contiguous buffers. - Currently, if the flag is set to ``False`` and a copy is needed, a + Currently, if this flag is set to ``False`` and a copy is needed, a ``RuntimeError`` will be raised. """ return _CuDFDataFrame( @@ -609,13 +604,13 @@ def _protocol_column_to_cudf_column_numeric(col:ColumnObject) -> \ raise NotImplementedError("column.offset > 0 not handled yet") _dbuffer, _ddtype = col.get_buffers()['data'] - _check_data_is_on_gpu(_dbuffer) + _check_buffer_is_on_gpu(_dbuffer) dcol = build_column(Buffer(_dbuffer.ptr, _dbuffer.bufsize), protocol_dtype_to_cupy_dtype(_ddtype)) return _set_missing_values(col, dcol), _dbuffer -def _check_data_is_on_gpu(buffer : _CuDFBuffer) -> None: +def _check_buffer_is_on_gpu(buffer : _CuDFBuffer) -> None: if buffer.__dlpack_device__()[0] != _Device.CUDA and not buffer._allow_copy: raise TypeError("This operation must copy data from CPU to GPU. " "Set `allow_copy=True` to allow it.") @@ -637,8 +632,7 @@ def protocol_dtype_to_cupy_dtype(_dtype) -> cp.dtype: kind = _dtype[0] bitwidth = _dtype[1] _k = _DtypeKind - if _dtype[0] not in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL,_k.CATEGORICAL, - _k.STRING, _k.DATETIME): + if _dtype[0] not in _SUPPORTED_KINDS: raise RuntimeError(f"Data type {_dtype[0]} not handled yet") return _CP_DTYPES[kind][bitwidth] @@ -654,7 +648,7 @@ def _protocol_column_to_cudf_column_categorical(col : ColumnObject) -> \ categories = as_column(mapping.values()) codes_buffer, codes_dtype = col.get_buffers()['data'] - _check_data_is_on_gpu(codes_buffer) + _check_buffer_is_on_gpu(codes_buffer) cdtype = protocol_dtype_to_cupy_dtype(codes_dtype) codes = build_column(Buffer(codes_buffer.ptr, codes_buffer.bufsize), cdtype) @@ -673,16 +667,17 @@ def _protocol_column_to_cudf_column_string(col : ColumnObject) -> \ buffers = col.get_buffers() # Retrieve the data buffer containing the UTF-8 code units - dbuffer, bdtype = buffers["data"] - _check_data_is_on_gpu(dbuffer) - encoded_string = build_column(Buffer(dbuffer.ptr, dbuffer.bufsize), - protocol_dtype_to_cupy_dtype(bdtype) + data_buffer, data_dtype = buffers["data"] + _check_buffer_is_on_gpu(data_buffer) + encoded_string = build_column(Buffer(data_buffer.ptr, data_buffer.bufsize), + protocol_dtype_to_cupy_dtype(data_dtype) ) # Retrieve the offsets buffer containing the index offsets demarcating the beginning and end of each string - obuffer, odtype = buffers["offsets"] - offsets = build_column(Buffer(obuffer.ptr, obuffer.bufsize), - protocol_dtype_to_cupy_dtype(odtype) + offset_buffer, offset_dtype = buffers["offsets"] + _check_buffer_is_on_gpu(offset_buffer) + offsets = build_column(Buffer(offset_buffer.ptr, offset_buffer.bufsize), + protocol_dtype_to_cupy_dtype(offset_dtype) ) col_str = build_column(None, dtype=cp.dtype('O'), children=(offsets, encoded_string)) diff --git a/python/cudf/cudf/tests/test_df_protocol.py b/python/cudf/cudf/tests/test_df_protocol.py index ac99943b671..d66b0751780 100644 --- a/python/cudf/cudf/tests/test_df_protocol.py +++ b/python/cudf/cudf/tests/test_df_protocol.py @@ -39,7 +39,6 @@ def assert_buffer_equal(buffer_and_dtype: Tuple[_CuDFBuffer, Any], cudfcol): # check that non null values are the equals as null are represented # by sentinel values in the buffer. non_null_idxs = cudfcol != None - print(non_null_idxs, cudfcol is not None) assert_eq(col_from_buf[non_null_idxs], cudfcol[non_null_idxs]) if dtype[0] != _DtypeKind.BOOL: From 3eefe0c6b6ef0e31517f40ed3e41fb9c587ac538 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Isma=C3=ABl=20Kon=C3=A9?= Date: Mon, 1 Nov 2021 17:29:27 +0100 Subject: [PATCH 38/60] change bare exception into RuntimeError Co-authored-by: Bradley Dice --- python/cudf/cudf/core/df_protocol.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index b9c3efac707..3de1fb084c2 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -337,7 +337,7 @@ def get_buffers(self) -> Dict[str, _CuDFBuffer]: buffers["data"] = self._get_data_buffer() try: buffers["validity"] = self._get_validity_buffer() - except: + except RuntimeError: buffers["validity"] = None try: From a11ddb6ff09b23a286a8a8a702683305670b904b Mon Sep 17 00:00:00 2001 From: iskode Date: Tue, 2 Nov 2021 08:18:31 +0000 Subject: [PATCH 39/60] fix flake8 style checks --- python/cudf/cudf/core/df_protocol.py | 262 +++++++++++++++------------ 1 file changed, 147 insertions(+), 115 deletions(-) diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index d0b44ea597f..2bad99bea77 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -1,6 +1,5 @@ import enum import collections -import ctypes from typing import Any, Optional, Tuple, Dict, Iterable, Sequence import cudf @@ -11,9 +10,6 @@ from numba import cuda - - - # Implementation of interchange protocol classes # ---------------------------------------------- @@ -26,6 +22,7 @@ class _DtypeKind(enum.IntEnum): DATETIME = 22 CATEGORICAL = 23 + class _Device(enum.IntEnum): CPU = 1 CUDA = 2 @@ -36,16 +33,18 @@ class _Device(enum.IntEnum): VPI = 9 ROCM = 10 + _k = _DtypeKind _SUPPORTED_KINDS = (_k.INT, _k.UINT, _k.FLOAT, _k.CATEGORICAL, - _k.BOOL, _k.STRING) + _k.BOOL, _k.STRING) + class _CuDFBuffer: """ Data in the buffer is guaranteed to be contiguous in memory. """ - def __init__(self, buf : cudf.core.buffer.Buffer, + def __init__(self, buf : cudf.core.buffer.Buffer, dtype: np.dtype, allow_copy : bool = True) -> None: """ Use cudf.core.buffer.Buffer object. @@ -53,7 +52,7 @@ def __init__(self, buf : cudf.core.buffer.Buffer, # Store the cudf buffer where the data resides as a private # attribute, so we can use it to retrieve the public attributes self._buf = buf - self._dtype = dtype + self._dtype = dtype self._allow_copy = allow_copy @property @@ -69,12 +68,12 @@ def ptr(self) -> int: Pointer to start of the buffer as an integer. """ return self._buf.ptr - + def __dlpack__(self) : """ DLPack not implemented in NumPy yet, so leave it out here. """ - try: + try: cudarray = cuda.as_cuda_array(self._buf).view(self._dtype) res = cp.asarray(cudarray).toDlpack() @@ -91,10 +90,14 @@ def __dlpack_device__(self) -> Tuple[_Device, int]: def __repr__(self) -> str: return f'{self.__class__.__name__}(' + str({'bufsize': self.bufsize, - 'ptr': self.ptr, - 'dlpack': self.__dlpack__(), - 'device': self.__dlpack_device__()[0].name} - ) + ')' + 'ptr': self.ptr, + 'dlpack': + self.__dlpack__(), + 'device': + self.__dlpack_device__()[0] + .name}) + + ')' + class _CuDFColumn: """ @@ -112,7 +115,7 @@ class _CuDFColumn: """ def __init__(self, column: cudf.core.column.ColumnBase, - nan_as_null : bool = True, + nan_as_null : bool = True, allow_copy: bool = True) -> None: """ Note: doesn't deal with extension arrays yet, just assume a regular @@ -144,7 +147,8 @@ def offset(self) -> int: @property def dtype(self) -> Tuple[_DtypeKind, int, str, str]: """ - Dtype description as a tuple ``(kind, bit-width, format string, endianness)`` + Dtype description as a tuple + ``(kind, bit-width, format string, endianness)`` Kind : @@ -163,27 +167,28 @@ def dtype(self) -> Tuple[_DtypeKind, int, str, str]: Notes: - - Kind specifiers are aligned with DLPack where possible (hence the - jump to 20, leave enough room for future extension) - - Masks must be specified as boolean with either bit width 1 (for bit - masks) or 8 (for byte masks). + - Kind specifiers are aligned with DLPack where possible + (hence the jump to 20, leave enough room for future extension) + - Masks must be specified as boolean with either bit width 1 + (for bit masks) or 8 (for byte masks). - Dtype width in bits was preferred over bytes - - Endianness isn't too useful, but included now in case in the future - we need to support non-native endianness + - Endianness isn't too useful, but included now in case + in the future we need to support non-native endianness - Went with Apache Arrow format strings over NumPy format strings because they're more complete from a dataframe perspective - - Format strings are mostly useful for datetime specification, and - for categoricals. + - Format strings are mostly useful for datetime specification, + and for categoricals. - For categoricals, the format string describes the type of the - categorical in the data buffer. In case of a separate encoding of - the categorical (e.g. an integer to string mapping), this can - be derived from ``self.describe_categorical``. - - Data types not included: complex, Arrow-style null, binary, decimal, - and nested (list, struct, map, union) dtypes. + categorical in the data buffer. In case of a separate encoding + of the categorical (e.g. an integer to string mapping), + this can be derived from ``self.describe_categorical``. + - Data types not included: complex, Arrow-style null, + binary, decimal, and nested (list, struct, map, union) dtypes. """ dtype = self._col.dtype - # For now, assume that, if the column dtype is 'O' (i.e., `object`), then we have an array of strings + # For now, assume that, if the column dtype is 'O' (i.e., `object`), + # then we have an array of strings if not isinstance(dtype, cudf.CategoricalDtype) and dtype.kind == 'O': return (_DtypeKind.STRING, 8, 'u', '=') @@ -194,8 +199,9 @@ def _dtype_from_cudfdtype(self, dtype) -> Tuple[_DtypeKind, int, str, str]: See `self.dtype` for details. """ # Note: 'c' (complex) not handled yet (not in array spec v1). - # 'b', 'B' (bytes), 'S', 'a', (old-style string) 'V' (void) not handled - # datetime and timedelta both map to datetime (is timedelta handled?) + # 'b', 'B' (bytes), 'S', 'a', (old-style string) 'V' (void) + # not handled datetime and timedelta both map to datetime + # (is timedelta handled?) _k = _DtypeKind _np_kinds = {"i": _k.INT, "u": _k.UINT, "f": _k.FLOAT, "b": _k.BOOL, "U": _k.STRING, @@ -206,7 +212,7 @@ def _dtype_from_cudfdtype(self, dtype) -> Tuple[_DtypeKind, int, str, str]: if isinstance(dtype, cudf.CategoricalDtype): kind = _k.CATEGORICAL # Codes and categories' dtypes are different. - # We use codes' dtype as these are stored in the buffer. + # We use codes' dtype as these are stored in the buffer. dtype = self._col.codes.dtype else: raise ValueError(f"Data type {dtype} not supported by exchange" @@ -232,23 +238,21 @@ def describe_categorical(self) -> Tuple[bool, bool, Dict[int, Any]]: Content of returned dict: - - "is_ordered" : bool, whether the ordering of dictionary indices is - semantically meaningful. + - "is_ordered" : bool, whether the ordering of dictionary + indices is semantically meaningful. - "is_dictionary" : bool, whether a dictionary-style mapping of categorical values to other objects exists - "mapping" : dict, Python-level only (e.g. ``{int: str}``). None if not a dictionary-style categorical. """ if not self.dtype[0] == _DtypeKind.CATEGORICAL: - raise TypeError("`describe_categorical only works on a column with " - "categorical dtype!") + raise TypeError("`describe_categorical only works on " + "a column with categorical dtype!") ordered = self._col.dtype.ordered is_dictionary = True # NOTE: this shows the children approach is better, transforming # `categories` to a "mapping" dict is inefficient - codes = self._col.codes # ndarray, length `self.size` - # categories.values is ndarray of length n_categories categories = self._col.categories mapping = {ix: val for ix, val in enumerate(categories.values_host)} return ordered, is_dictionary, mapping @@ -267,11 +271,11 @@ def describe_null(self) -> Tuple[int, Any]: - 3 : bit mask - 4 : byte mask - Value : if kind is "sentinel value", the actual value. If kind is a bit - mask or a byte mask, the value (0 or 1) indicating a missing value. None - otherwise. + Value : if kind is "sentinel value", the actual value. + If kind is a bit mask or a byte mask, the value (0 or 1) + indicating a missing value. + None otherwise. """ - _k = _DtypeKind kind = self.dtype[0] if self.null_count == 0: # there is no validity mask so it is non-nullable @@ -280,9 +284,10 @@ def describe_null(self) -> Tuple[int, Any]: elif kind in _SUPPORTED_KINDS: # bit mask is universally used in cudf for missing return 3, 0 - + else: - raise NotImplementedError(f"Data type {self.dtype} not yet supported") + raise NotImplementedError(f"Data type {self.dtype}" + " not yet supported") @property def null_count(self) -> int: @@ -304,7 +309,8 @@ def num_chunks(self) -> int: """ return 1 - def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable['_CuDFColumn']: + def get_chunks(self, n_chunks : Optional[int] = None) ->\ + Iterable['_CuDFColumn']: """ Return an iterable yielding the chunks. @@ -337,74 +343,84 @@ def get_buffers(self) -> Dict[str, _CuDFBuffer]: buffers["data"] = self._get_data_buffer() try: buffers["validity"] = self._get_validity_buffer() - except: + except RuntimeError: buffers["validity"] = None try: buffers["offsets"] = self._get_offsets_buffer() - except: + except RuntimeError: buffers["offsets"] = None return buffers - def _get_data_buffer(self) -> Tuple[_CuDFBuffer, Any]: # Any is for self.dtype tuple + def _get_data_buffer(self) -> Tuple[_CuDFBuffer, + Tuple[_DtypeKind, int, str, str]]: """ - Return the buffer containing the data and the buffer's associated dtype. + Return the buffer containing the data and + the buffer's associated dtype. """ _k = _DtypeKind - invalid = self.describe_null[1] if self.dtype[0] in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL): - buffer = _CuDFBuffer(self._col.data, self._col.dtype, + buffer = _CuDFBuffer(self._col.data, self._col.dtype, allow_copy=self._allow_copy) dtype = self.dtype elif self.dtype[0] == _k.CATEGORICAL: codes = self._col.codes - buffer = _CuDFBuffer(self._col.codes.data, self._col.codes.dtype, + buffer = _CuDFBuffer(self._col.codes.data, self._col.codes.dtype, allow_copy=self._allow_copy) dtype = self._dtype_from_cudfdtype(codes.dtype) elif self.dtype[0] == _k.STRING: encoded_string = self._col.children[1] - buffer = _CuDFBuffer(encoded_string.data, encoded_string.dtype, allow_copy=self._allow_copy) - dtype = self._dtype_from_cudfdtype(encoded_string.dtype) + buffer = _CuDFBuffer(encoded_string.data, encoded_string.dtype, + allow_copy=self._allow_copy) + dtype = self._dtype_from_cudfdtype(encoded_string.dtype) else: - raise NotImplementedError(f"Data type {self._col.dtype} not handled yet") + raise NotImplementedError(f"Data type {self._col.dtype}" + " not handled yet") return buffer, dtype def _get_validity_buffer(self) -> Tuple[_CuDFBuffer, Any]: """ - Return the buffer containing the mask values indicating missing data and - the buffer's associated dtype. + Return the buffer containing the mask values + indicating missing data and the buffer's associated dtype. Raises RuntimeError if null representation is not a bit or byte mask. """ - + null, invalid = self.describe_null if null == 3: _k = _DtypeKind if self.dtype[0] == _k.CATEGORICAL: - buffer = _CuDFBuffer(self._col.codes._get_mask_as_column().data, cp.uint8, - allow_copy=self._allow_copy) + buffer = _CuDFBuffer(self._col.codes. + _get_mask_as_column().data, + cp.uint8, allow_copy=self._allow_copy) else: - buffer = _CuDFBuffer(self._col._get_mask_as_column().data, cp.uint8, + buffer = _CuDFBuffer(self._col. + _get_mask_as_column().data, + cp.uint8, allow_copy=self._allow_copy) dtype = (_k.UINT, 8, "C", "=") return buffer, dtype elif null == 1: - raise RuntimeError("This column uses NaN as null so does not have a separate mask") - elif null == 0: - raise RuntimeError("This column is non-nullable so does not have a mask") + raise RuntimeError("This column uses NaN as null " + "so does not have a separate mask") + elif null == 0: + raise RuntimeError("This column is non-nullable" + " so does not have a mask") else: - raise NotImplementedError(f"See {self.__class__.__name__}.describe_null method.") + raise NotImplementedError(f"See {self.__class__.__name__}" + ".describe_null method.") def _get_offsets_buffer(self) -> Tuple[_CuDFBuffer, Any]: """ - Return the buffer containing the offset values for variable-size binary - data (e.g., variable-length strings) and the buffer's associated dtype. + Return the buffer containing the offset values for + variable-size binary data (e.g., variable-length strings) + and the buffer's associated dtype. Raises RuntimeError if the data buffer does not have an associated offsets buffer. @@ -412,10 +428,12 @@ def _get_offsets_buffer(self) -> Tuple[_CuDFBuffer, Any]: _k = _DtypeKind if self.dtype[0] == _k.STRING: offsets = self._col.children[0] - buffer = _CuDFBuffer(offsets.data, offsets.dtype, allow_copy=self._allow_copy) - dtype = self._dtype_from_cudfdtype(offsets.dtype) + buffer = _CuDFBuffer(offsets.data, offsets.dtype, + allow_copy=self._allow_copy) + dtype = self._dtype_from_cudfdtype(offsets.dtype) else: - raise RuntimeError("This column has a fixed-length dtype so does not have an offsets buffer") + raise RuntimeError("This column has a fixed-length dtype " + "so does not have an offsets buffer") return buffer, dtype @@ -438,7 +456,8 @@ def __init__(self, df : 'cudf.core.dataframe.DataFrame', """ self._df = df # ``nan_as_null`` is a keyword intended for the consumer to tell the - # producer to overwrite null values in the data with ``NaN`` (or ``NaT``). + # producer to overwrite null values in the data with + # ``NaN`` (or ``NaT``). # This currently has no effect; once support for nullable extension # dtypes is added, this value should be propagated to columns. self._nan_as_null = nan_as_null @@ -447,7 +466,7 @@ def __init__(self, df : 'cudf.core.dataframe.DataFrame', @property def metadata(self): # `index` isn't a regular column, and the protocol doesn't support row - # labels - so we export it as Pandas-specific metadata here. + # labels - so we export it as cuDF-specific metadata here. return {"cudf.index": self._df.index} def num_columns(self) -> int: @@ -480,14 +499,16 @@ def select_columns(self, indices: Sequence[int]) -> '_CuDFDataFrame': return _CuDFDataFrame(self._df.iloc[:, indices]) - def select_columns_by_name(self, names: Sequence[str]) -> '_CuDFDataFrame': + def select_columns_by_name(self, names: Sequence[str]) ->\ + '_CuDFDataFrame': if not isinstance(names, collections.Sequence): raise ValueError("`names` is not a sequence") return _CuDFDataFrame(self._df.loc[:, names], self._nan_as_null, - self._allow_copy) + self._allow_copy) - def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable['_CuDFDataFrame']: + def get_chunks(self, n_chunks : Optional[int] = None) -> \ + Iterable['_CuDFDataFrame']: """ Return an iterator yielding the chunks. """ @@ -528,8 +549,8 @@ def __dataframe__(self, nan_as_null : bool = False, ----- - Interpreting a raw pointer (as in ``Buffer.ptr``) is annoying and unsafe to - do in pure Python. It's more general but definitely less friendly than having - ``to_arrow`` and ``to_numpy`` methods. So for the buffers which lack + do in pure Python. It's more general but definitely less friendly than + having ``to_arrow`` and ``to_numpy`` methods. So for the buffers which lack ``__dlpack__`` (e.g., because the column dtype isn't supported by DLPack), this is worth looking at again. @@ -548,7 +569,8 @@ def __dataframe__(self, nan_as_null : bool = False, _CP_DTYPES = {0: _INTS, 1: _UINTS, 2: _FLOATS, 20: {8: bool}} -def from_dataframe(df : DataFrameObject, allow_copy: bool = False) -> _CuDFDataFrame : +def from_dataframe(df : DataFrameObject, allow_copy: bool = False) ->\ + _CuDFDataFrame : """ Construct a cudf DataFrame from ``df`` if it supports ``__dataframe__`` """ @@ -577,17 +599,18 @@ def _from_dataframe(df : DataFrameObject) -> _CuDFDataFrame : col = df.get_column_by_name(name) if col.dtype[0] in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL): - columns[name], _buf = _protocol_column_to_cudf_column_numeric(col) + columns[name], _buf = _protocol_to_cudf_column_numeric(col) elif col.dtype[0] == _k.CATEGORICAL: - columns[name], _buf = _protocol_column_to_cudf_column_categorical(col) + columns[name], _buf = _protocol_to_cudf_column_categorical(col) elif col.dtype[0] == _k.STRING: - columns[name], _buf = _protocol_column_to_cudf_column_string(col) - + columns[name], _buf = _protocol_to_cudf_column_string(col) + else: - raise NotImplementedError(f"Data type {col.dtype[0]} not handled yet") - + raise NotImplementedError(f"Data type {col.dtype[0]}" + " not handled yet") + _buffers.append(_buf) df_new = cudf.DataFrame._from_data(columns) @@ -595,71 +618,80 @@ def _from_dataframe(df : DataFrameObject) -> _CuDFDataFrame : return df_new -def _protocol_column_to_cudf_column_numeric(col:ColumnObject) -> \ - Tuple[cudf.core.column.NumericalColumn, _CuDFBuffer]: +def _protocol_to_cudf_column_numeric(col: ColumnObject) -> \ + Tuple[cudf.core.column.NumericalColumn, _CuDFBuffer]: """ - Convert an int, uint, float or bool protocol column to the corresponding cudf column + Convert an int, uint, float or bool protocol column + to the corresponding cudf column """ if col.offset != 0: raise NotImplementedError("column.offset > 0 not handled yet") _dbuffer, _ddtype = col.get_buffers()['data'] _check_buffer_is_on_gpu(_dbuffer) - dcol = build_column(Buffer(_dbuffer.ptr, _dbuffer.bufsize), - protocol_dtype_to_cupy_dtype(_ddtype)) + dcol = build_column(Buffer(_dbuffer.ptr, _dbuffer.bufsize), + protocol_dtype_to_cupy_dtype(_ddtype)) return _set_missing_values(col, dcol), _dbuffer def _check_buffer_is_on_gpu(buffer : _CuDFBuffer) -> None: - if buffer.__dlpack_device__()[0] != _Device.CUDA and not buffer._allow_copy: + if buffer.__dlpack_device__()[0] != _Device.CUDA and \ + not buffer._allow_copy: raise TypeError("This operation must copy data from CPU to GPU. " - "Set `allow_copy=True` to allow it.") + "Set `allow_copy=True` to allow it.") + -def _set_missing_values(protocol_col: _CuDFColumn, - cudf_col:'cudf.core.dataframe.DataFrame') \ - -> cudf.core.column.ColumnBase: +def _set_missing_values(protocol_col: _CuDFColumn, + cudf_col: 'cudf.core.dataframe.DataFrame') -> \ + cudf.core.column.ColumnBase: null_kind, null_value = protocol_col.describe_null if null_kind != 0: - assert null_kind == 3, f"cudf supports only bit mask, null_kind should be 3, got: {null_kind}." + assert null_kind == 3, "cudf supports only bit mask, " + f"null_kind should be 3, got: {null_kind}." _mask_buffer, _mask_dtype = protocol_col.get_buffers()["validity"] - bitmask = cp.asarray(Buffer(_mask_buffer.ptr, _mask_buffer.bufsize), cp.bool8) + bitmask = cp.asarray(Buffer(_mask_buffer.ptr, _mask_buffer.bufsize), + cp.bool8) cudf_col[~bitmask] = None return cudf_col + def protocol_dtype_to_cupy_dtype(_dtype) -> cp.dtype: kind = _dtype[0] bitwidth = _dtype[1] - _k = _DtypeKind if _dtype[0] not in _SUPPORTED_KINDS: raise RuntimeError(f"Data type {_dtype[0]} not handled yet") - + return _CP_DTYPES[kind][bitwidth] -def _protocol_column_to_cudf_column_categorical(col : ColumnObject) -> \ - Tuple[cudf.core.column.CategoricalColumn, _CuDFBuffer] : + +def _protocol_to_cudf_column_categorical(col : ColumnObject) -> \ + Tuple[cudf.core.column.CategoricalColumn, _CuDFBuffer] : """ Convert a categorical column to a Series instance """ ordered, is_dict, mapping = col.describe_categorical if not is_dict: - raise NotImplementedError('Non-dictionary categoricals not supported yet') + raise NotImplementedError("Non-dictionary categoricals" + " not supported yet") categories = as_column(mapping.values()) codes_buffer, codes_dtype = col.get_buffers()['data'] _check_buffer_is_on_gpu(codes_buffer) cdtype = protocol_dtype_to_cupy_dtype(codes_dtype) - codes = build_column(Buffer(codes_buffer.ptr, codes_buffer.bufsize), cdtype) - - cudfcol = build_categorical_column(categories=categories,codes=codes,mask=codes.base_mask, - size=codes.size,ordered=ordered) + codes = build_column(Buffer(codes_buffer.ptr, codes_buffer.bufsize), + cdtype) + + cudfcol = build_categorical_column(categories=categories, codes=codes, + mask=codes.base_mask, size=codes.size, + ordered=ordered) return _set_missing_values(col, cudfcol), codes_buffer -def _protocol_column_to_cudf_column_string(col : ColumnObject) -> \ - Tuple[cudf.core.column.StringColumn, Tuple[_CuDFBuffer]] : +def _protocol_to_cudf_column_string(col : ColumnObject) -> \ + Tuple[cudf.core.column.StringColumn, Tuple[_CuDFBuffer]] : """ Convert a string ColumnObject to cudf Column object. """ @@ -670,16 +702,16 @@ def _protocol_column_to_cudf_column_string(col : ColumnObject) -> \ data_buffer, data_dtype = buffers["data"] _check_buffer_is_on_gpu(data_buffer) encoded_string = build_column(Buffer(data_buffer.ptr, data_buffer.bufsize), - protocol_dtype_to_cupy_dtype(data_dtype) - ) + protocol_dtype_to_cupy_dtype(data_dtype)) - # Retrieve the offsets buffer containing the index offsets demarcating the beginning and end of each string + # Retrieve the offsets buffer containing the index offsets demarcating + # the beginning and end of each string offset_buffer, offset_dtype = buffers["offsets"] _check_buffer_is_on_gpu(offset_buffer) - offsets = build_column(Buffer(offset_buffer.ptr, offset_buffer.bufsize), - protocol_dtype_to_cupy_dtype(offset_dtype) - ) - - col_str = build_column(None, dtype=cp.dtype('O'), children=(offsets, encoded_string)) + offsets = build_column(Buffer(offset_buffer.ptr, offset_buffer.bufsize), + protocol_dtype_to_cupy_dtype(offset_dtype)) + + col_str = build_column(None, dtype=cp.dtype('O'), + children=(offsets, encoded_string)) return _set_missing_values(col, col_str), buffers From 117e4321bb2b51c8c543ba5241234125997967ac Mon Sep 17 00:00:00 2001 From: iskode Date: Tue, 2 Nov 2021 08:57:06 +0000 Subject: [PATCH 40/60] fix flake8 style checks for 'test_df_protocol.py' file --- python/cudf/cudf/tests/test_df_protocol.py | 88 ++++++++++------------ 1 file changed, 41 insertions(+), 47 deletions(-) diff --git a/python/cudf/cudf/tests/test_df_protocol.py b/python/cudf/cudf/tests/test_df_protocol.py index d66b0751780..a80b94dc419 100644 --- a/python/cudf/cudf/tests/test_df_protocol.py +++ b/python/cudf/cudf/tests/test_df_protocol.py @@ -1,12 +1,9 @@ -import datetime import cupy as cp -import numpy as np import pytest from cudf.core.df_protocol import ( - _from_dataframe, + _from_dataframe, _DtypeKind, protocol_dtype_to_cupy_dtype, - _CuDFDataFrame, _CuDFColumn, _CuDFBuffer ) @@ -14,33 +11,24 @@ import cudf from cudf.core.column import build_column from cudf.core.buffer import Buffer -from cudf.testing import _utils as utils -from cudf.testing._utils import ( - ALL_TYPES, - DATETIME_TYPES, - NUMERIC_TYPES, - assert_eq, - assert_exceptions_equal, - does_not_raise, - gen_rand, -) +from cudf.testing._utils import assert_eq import pandas as pd from typing import Any, Tuple DataFrameObject = Any + def assert_buffer_equal(buffer_and_dtype: Tuple[_CuDFBuffer, Any], cudfcol): buf, dtype = buffer_and_dtype device_id = cp.asarray(cudfcol.data).device.id assert buf.__dlpack_device__() == (2, device_id) col_from_buf = build_column(Buffer(buf.ptr, buf.bufsize), - protocol_dtype_to_cupy_dtype(dtype) - ) + protocol_dtype_to_cupy_dtype(dtype)) # check that non null values are the equals as null are represented # by sentinel values in the buffer. - non_null_idxs = cudfcol != None + non_null_idxs = cudfcol != cudf.NA assert_eq(col_from_buf[non_null_idxs], cudfcol[non_null_idxs]) - + if dtype[0] != _DtypeKind.BOOL: array_from_dlpack = cp.fromDlpack(buf.__dlpack__()) col_array = cp.asarray(cudfcol.data_array_view) @@ -49,42 +37,37 @@ def assert_buffer_equal(buffer_and_dtype: Tuple[_CuDFBuffer, Any], cudfcol): pytest.raises(TypeError, buf.__dlpack__) - - def assert_column_equal(col: _CuDFColumn, cudfcol): - assert col.size == cudfcol.size + assert col.size == cudfcol.size assert col.offset == 0 assert col.null_count == cudfcol.null_count assert col.num_chunks() == 1 if col.null_count == 0 : pytest.raises(RuntimeError, col._get_validity_buffer) - assert col.get_buffers()['validity'] == None + assert col.get_buffers()['validity'] is None else: assert_buffer_equal(col.get_buffers()['validity'], cudfcol._get_mask_as_column().astype(cp.uint8)) - + if col.dtype[0] == _DtypeKind.CATEGORICAL: assert_buffer_equal(col.get_buffers()['data'], cudfcol.codes) - assert col.get_buffers()['offsets'] == None + assert col.get_buffers()['offsets'] is None elif col.dtype[0] == _DtypeKind.STRING: assert_buffer_equal(col.get_buffers()['data'], cudfcol.children[1]) assert_buffer_equal(col.get_buffers()['offsets'], cudfcol.children[0]) - + else: assert_buffer_equal(col.get_buffers()['data'], cudfcol) - assert col.get_buffers()['offsets'] == None + assert col.get_buffers()['offsets'] is None - null_kind, null_value = col.describe_null if col.null_count == 0: - assert null_kind == 0 - assert null_value == None + assert col.describe_null == (0, None) else: - assert null_kind == 3 - assert null_value == 0 + assert col.describe_null == (3, 0) -def assert_dataframe_equal(dfo: DataFrameObject, df:cudf.DataFrame): +def assert_dataframe_equal(dfo: DataFrameObject, df: cudf.DataFrame): assert dfo.num_columns() == len(df.columns) assert dfo.num_rows() == len(df) assert dfo.num_chunks() == 1 @@ -108,29 +91,35 @@ def _test_from_dataframe_equals(dfobj): def _test_from_dataframe_exception(dfobj): - exception_msg = "This operation must copy data from CPU to GPU. Set `allow_copy=True` to allow it." + exception_msg = "This operation must copy data from CPU to GPU." + " Set `allow_copy=True` to allow it." with pytest.raises(TypeError, match=exception_msg): - df2 = _from_dataframe(dfobj) + _from_dataframe(dfobj) + def _test_datatype(data): cdf = cudf.DataFrame(data=data) _test_from_dataframe_equals(cdf.__dataframe__(allow_copy=False)) _test_from_dataframe_equals(cdf.__dataframe__(allow_copy=True)) + def test_from_dataframe(): data = dict(a=[1, 2, 3], b=[9, 10, 11]) df1 = cudf.DataFrame(data=data) df2 = cudf.from_dataframe(df1) assert_eq(df1, df2) - + + def test_int_dtype(): data_int = dict(a=[1, 2, 3], b=[9, 10, 11]) _test_datatype(data_int) + def test_float_dtype(): data_float = dict(a=[1.5, 2.5, 3.5], b=[9.2, 10.5, 11.8]) _test_datatype(data_float) + def test_categorical_dtype(): cdf = cudf.DataFrame({"A": [1, 2, 5, 1]}) cdf["A"] = cdf["A"].astype("category") @@ -140,6 +129,7 @@ def test_categorical_dtype(): _test_from_dataframe_equals(cdf.__dataframe__(allow_copy=False)) _test_from_dataframe_equals(cdf.__dataframe__(allow_copy=True)) + def test_bool_dtype(): data_bool = dict(a=[True, True, False], b=[False, True, False]) _test_datatype(data_bool) @@ -152,23 +142,25 @@ def test_string_dtype(): def test_mixed_dtype(): data_mixed = dict(int=[1, 2, 3], float=[1.5, 2.5, 3.5], - bool=[True, False, True], categorical=[5, 1, 5], - string=["rapidsai-cudf ", "", "df protocol"]) + bool=[True, False, True], categorical=[5, 1, 5], + string=["rapidsai-cudf ", "", "df protocol"]) _test_datatype(data_mixed) def test_NA_int_dtype(): - data_int = dict(a=[1, None, 3, None, 5], + data_int = dict(a=[1, None, 3, None, 5], b=[9, 10, None, 7, 8], - c= [6, 19, 20, 100, 1000] ) + c=[6, 19, 20, 100, 1000]) _test_datatype(data_int) + def test_NA_float_dtype(): - data_float = dict(a=[1.4, None, 3.6, None, 5.2], - b=[9.7, 10.9, None, 7.8, 8.2], - c= [6.1, 19.2, 20.3, 100.4, 1000.5] ) + data_float = dict(a=[1.4, None, 3.6, None, 5.2], + b=[9.7, 10.9, None, 7.8, 8.2], + c=[6.1, 19.2, 20.3, 100.4, 1000.5]) _test_datatype(data_float) + def test_NA_categorical_dtype(): df = cudf.DataFrame({"A": [1, 2, 5, 1]}) df["B"] = df["A"].astype("category") @@ -184,10 +176,12 @@ def test_NA_categorical_dtype(): _test_from_dataframe_equals(df.__dataframe__(allow_copy=False)) _test_from_dataframe_equals(df.__dataframe__(allow_copy=True)) + def test_NA_bool_dtype(): data_bool = dict(a=[None, True, False], b=[False, None, None]) _test_datatype(data_bool) + def test_NA_string_dtype(): df = cudf.DataFrame({"A": ["a", "b", "cdef", "", "g"]}) df["B"] = df["A"].astype("object") @@ -204,9 +198,9 @@ def test_NA_string_dtype(): def test_NA_mixed_dtype(): - data_mixed = dict(int=[1, None, 2, 3, 1000], float=[None, 1.5, 2.5, 3.5, None], - bool=[True, None, False, None, None], - categorical=[5, 1, 5, 3, None], - string=[None, None, None, "df protocol", None]) + data_mixed = dict(int=[1, None, 2, 3, 1000], + float=[None, 1.5, 2.5, 3.5, None], + bool=[True, None, False, None, None], + categorical=[5, 1, 5, 3, None], + string=[None, None, None, "df protocol", None]) _test_datatype(data_mixed) - From 5eb76420cd8084daf42bd40fdb59528185ded68a Mon Sep 17 00:00:00 2001 From: iskode Date: Tue, 2 Nov 2021 08:58:36 +0000 Subject: [PATCH 41/60] isort formatting --- python/cudf/cudf/core/df_protocol.py | 12 ++++++------ python/cudf/cudf/tests/test_df_protocol.py | 19 ++++++++++--------- 2 files changed, 16 insertions(+), 15 deletions(-) diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index 2bad99bea77..6285c283c02 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -1,14 +1,14 @@ -import enum import collections -from typing import Any, Optional, Tuple, Dict, Iterable, Sequence +import enum +from typing import Any, Dict, Iterable, Optional, Sequence, Tuple -import cudf -from cudf.core.column import as_column, build_column, build_categorical_column -from cudf.core.buffer import Buffer -import numpy as np import cupy as cp +import numpy as np from numba import cuda +import cudf +from cudf.core.buffer import Buffer +from cudf.core.column import as_column, build_categorical_column, build_column # Implementation of interchange protocol classes # ---------------------------------------------- diff --git a/python/cudf/cudf/tests/test_df_protocol.py b/python/cudf/cudf/tests/test_df_protocol.py index a80b94dc419..45fb30ce998 100644 --- a/python/cudf/cudf/tests/test_df_protocol.py +++ b/python/cudf/cudf/tests/test_df_protocol.py @@ -1,19 +1,20 @@ +from typing import Any, Tuple + import cupy as cp +import pandas as pd import pytest + +import cudf +from cudf.core.buffer import Buffer +from cudf.core.column import build_column from cudf.core.df_protocol import ( - _from_dataframe, + _CuDFBuffer, + _CuDFColumn, _DtypeKind, + _from_dataframe, protocol_dtype_to_cupy_dtype, - _CuDFColumn, - _CuDFBuffer ) - -import cudf -from cudf.core.column import build_column -from cudf.core.buffer import Buffer from cudf.testing._utils import assert_eq -import pandas as pd -from typing import Any, Tuple DataFrameObject = Any From e164540aa2ecddd1a400d6d9e9159e54f9f303dc Mon Sep 17 00:00:00 2001 From: iskode Date: Tue, 2 Nov 2021 09:09:32 +0000 Subject: [PATCH 42/60] run 'black' to format code. --- python/cudf/cudf/core/df_protocol.py | 318 +++++++++++++-------- python/cudf/cudf/tests/test_df_protocol.py | 71 +++-- 2 files changed, 238 insertions(+), 151 deletions(-) diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index 6285c283c02..debb0d29079 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -13,12 +13,13 @@ # Implementation of interchange protocol classes # ---------------------------------------------- + class _DtypeKind(enum.IntEnum): INT = 0 UINT = 1 FLOAT = 2 BOOL = 20 - STRING = 21 # UTF-8 + STRING = 21 # UTF-8 DATETIME = 22 CATEGORICAL = 23 @@ -35,8 +36,14 @@ class _Device(enum.IntEnum): _k = _DtypeKind -_SUPPORTED_KINDS = (_k.INT, _k.UINT, _k.FLOAT, _k.CATEGORICAL, - _k.BOOL, _k.STRING) +_SUPPORTED_KINDS = ( + _k.INT, + _k.UINT, + _k.FLOAT, + _k.CATEGORICAL, + _k.BOOL, + _k.STRING, +) class _CuDFBuffer: @@ -44,8 +51,12 @@ class _CuDFBuffer: Data in the buffer is guaranteed to be contiguous in memory. """ - def __init__(self, buf : cudf.core.buffer.Buffer, - dtype: np.dtype, allow_copy : bool = True) -> None: + def __init__( + self, + buf: cudf.core.buffer.Buffer, + dtype: np.dtype, + allow_copy: bool = True, + ) -> None: """ Use cudf.core.buffer.Buffer object. """ @@ -69,7 +80,7 @@ def ptr(self) -> int: """ return self._buf.ptr - def __dlpack__(self) : + def __dlpack__(self): """ DLPack not implemented in NumPy yet, so leave it out here. """ @@ -78,7 +89,7 @@ def __dlpack__(self) : res = cp.asarray(cudarray).toDlpack() except ValueError: - raise TypeError(f'dtype {self._dtype} unsupported by `dlpack`') + raise TypeError(f"dtype {self._dtype} unsupported by `dlpack`") return res @@ -89,14 +100,15 @@ def __dlpack_device__(self) -> Tuple[_Device, int]: return (_Device.CUDA, cp.asarray(self._buf).device.id) def __repr__(self) -> str: - return f'{self.__class__.__name__}(' + str({'bufsize': self.bufsize, - 'ptr': self.ptr, - 'dlpack': - self.__dlpack__(), - 'device': - self.__dlpack_device__()[0] - .name}) - + ')' + return f"{self.__class__.__name__}(" + str( + { + "bufsize": self.bufsize, + "ptr": self.ptr, + "dlpack": self.__dlpack__(), + "device": self.__dlpack_device__()[0].name, + } + ) + +")" class _CuDFColumn: @@ -114,16 +126,20 @@ class _CuDFColumn: """ - def __init__(self, column: cudf.core.column.ColumnBase, - nan_as_null : bool = True, - allow_copy: bool = True) -> None: + def __init__( + self, + column: cudf.core.column.ColumnBase, + nan_as_null: bool = True, + allow_copy: bool = True, + ) -> None: """ Note: doesn't deal with extension arrays yet, just assume a regular Series/ndarray for now. """ if not isinstance(column, cudf.Series): - raise NotImplementedError("Columns of type {} not handled " - "yet".format(type(column))) + raise NotImplementedError( + "Columns of type {} not handled " "yet".format(type(column)) + ) # Store the column as a private attribute self._col = as_column(column) @@ -189,8 +205,8 @@ def dtype(self) -> Tuple[_DtypeKind, int, str, str]: # For now, assume that, if the column dtype is 'O' (i.e., `object`), # then we have an array of strings - if not isinstance(dtype, cudf.CategoricalDtype) and dtype.kind == 'O': - return (_DtypeKind.STRING, 8, 'u', '=') + if not isinstance(dtype, cudf.CategoricalDtype) and dtype.kind == "O": + return (_DtypeKind.STRING, 8, "u", "=") return self._dtype_from_cudfdtype(dtype) @@ -203,9 +219,15 @@ def _dtype_from_cudfdtype(self, dtype) -> Tuple[_DtypeKind, int, str, str]: # not handled datetime and timedelta both map to datetime # (is timedelta handled?) _k = _DtypeKind - _np_kinds = {"i": _k.INT, "u": _k.UINT, "f": _k.FLOAT, "b": _k.BOOL, - "U": _k.STRING, - "M": _k.DATETIME, "m": _k.DATETIME} + _np_kinds = { + "i": _k.INT, + "u": _k.UINT, + "f": _k.FLOAT, + "b": _k.BOOL, + "U": _k.STRING, + "M": _k.DATETIME, + "m": _k.DATETIME, + } kind = _np_kinds.get(dtype.kind, None) if kind is None: # Not a NumPy/CuPy dtype. Check if it's a categorical maybe @@ -215,15 +237,16 @@ def _dtype_from_cudfdtype(self, dtype) -> Tuple[_DtypeKind, int, str, str]: # We use codes' dtype as these are stored in the buffer. dtype = self._col.codes.dtype else: - raise ValueError(f"Data type {dtype} not supported by exchange" - "protocol") + raise ValueError( + f"Data type {dtype} not supported by exchange" "protocol" + ) if kind not in _SUPPORTED_KINDS: raise NotImplementedError(f"Data type {dtype} not handled yet") bitwidth = dtype.itemsize * 8 format_str = dtype.str - endianness = dtype.byteorder if kind != _k.CATEGORICAL else '=' + endianness = dtype.byteorder if kind != _k.CATEGORICAL else "=" return (kind, bitwidth, format_str, endianness) @property @@ -246,8 +269,10 @@ def describe_categorical(self) -> Tuple[bool, bool, Dict[int, Any]]: None if not a dictionary-style categorical. """ if not self.dtype[0] == _DtypeKind.CATEGORICAL: - raise TypeError("`describe_categorical only works on " - "a column with categorical dtype!") + raise TypeError( + "`describe_categorical only works on " + "a column with categorical dtype!" + ) ordered = self._col.dtype.ordered is_dictionary = True @@ -286,8 +311,9 @@ def describe_null(self) -> Tuple[int, Any]: return 3, 0 else: - raise NotImplementedError(f"Data type {self.dtype}" - " not yet supported") + raise NotImplementedError( + f"Data type {self.dtype}" " not yet supported" + ) @property def null_count(self) -> int: @@ -309,8 +335,9 @@ def num_chunks(self) -> int: """ return 1 - def get_chunks(self, n_chunks : Optional[int] = None) ->\ - Iterable['_CuDFColumn']: + def get_chunks( + self, n_chunks: Optional[int] = None + ) -> Iterable["_CuDFColumn"]: """ Return an iterable yielding the chunks. @@ -353,33 +380,42 @@ def get_buffers(self) -> Dict[str, _CuDFBuffer]: return buffers - def _get_data_buffer(self) -> Tuple[_CuDFBuffer, - Tuple[_DtypeKind, int, str, str]]: + def _get_data_buffer( + self, + ) -> Tuple[_CuDFBuffer, Tuple[_DtypeKind, int, str, str]]: """ Return the buffer containing the data and the buffer's associated dtype. """ _k = _DtypeKind if self.dtype[0] in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL): - buffer = _CuDFBuffer(self._col.data, self._col.dtype, - allow_copy=self._allow_copy) + buffer = _CuDFBuffer( + self._col.data, self._col.dtype, allow_copy=self._allow_copy + ) dtype = self.dtype elif self.dtype[0] == _k.CATEGORICAL: codes = self._col.codes - buffer = _CuDFBuffer(self._col.codes.data, self._col.codes.dtype, - allow_copy=self._allow_copy) + buffer = _CuDFBuffer( + self._col.codes.data, + self._col.codes.dtype, + allow_copy=self._allow_copy, + ) dtype = self._dtype_from_cudfdtype(codes.dtype) elif self.dtype[0] == _k.STRING: encoded_string = self._col.children[1] - buffer = _CuDFBuffer(encoded_string.data, encoded_string.dtype, - allow_copy=self._allow_copy) + buffer = _CuDFBuffer( + encoded_string.data, + encoded_string.dtype, + allow_copy=self._allow_copy, + ) dtype = self._dtype_from_cudfdtype(encoded_string.dtype) else: - raise NotImplementedError(f"Data type {self._col.dtype}" - " not handled yet") + raise NotImplementedError( + f"Data type {self._col.dtype}" " not handled yet" + ) return buffer, dtype @@ -395,26 +431,33 @@ def _get_validity_buffer(self) -> Tuple[_CuDFBuffer, Any]: if null == 3: _k = _DtypeKind if self.dtype[0] == _k.CATEGORICAL: - buffer = _CuDFBuffer(self._col.codes. - _get_mask_as_column().data, - cp.uint8, allow_copy=self._allow_copy) + buffer = _CuDFBuffer( + self._col.codes._get_mask_as_column().data, + cp.uint8, + allow_copy=self._allow_copy, + ) else: - buffer = _CuDFBuffer(self._col. - _get_mask_as_column().data, - cp.uint8, - allow_copy=self._allow_copy) + buffer = _CuDFBuffer( + self._col._get_mask_as_column().data, + cp.uint8, + allow_copy=self._allow_copy, + ) dtype = (_k.UINT, 8, "C", "=") return buffer, dtype elif null == 1: - raise RuntimeError("This column uses NaN as null " - "so does not have a separate mask") + raise RuntimeError( + "This column uses NaN as null " + "so does not have a separate mask" + ) elif null == 0: - raise RuntimeError("This column is non-nullable" - " so does not have a mask") + raise RuntimeError( + "This column is non-nullable" " so does not have a mask" + ) else: - raise NotImplementedError(f"See {self.__class__.__name__}" - ".describe_null method.") + raise NotImplementedError( + f"See {self.__class__.__name__}" ".describe_null method." + ) def _get_offsets_buffer(self) -> Tuple[_CuDFBuffer, Any]: """ @@ -428,12 +471,15 @@ def _get_offsets_buffer(self) -> Tuple[_CuDFBuffer, Any]: _k = _DtypeKind if self.dtype[0] == _k.STRING: offsets = self._col.children[0] - buffer = _CuDFBuffer(offsets.data, offsets.dtype, - allow_copy=self._allow_copy) + buffer = _CuDFBuffer( + offsets.data, offsets.dtype, allow_copy=self._allow_copy + ) dtype = self._dtype_from_cudfdtype(offsets.dtype) else: - raise RuntimeError("This column has a fixed-length dtype " - "so does not have an offsets buffer") + raise RuntimeError( + "This column has a fixed-length dtype " + "so does not have an offsets buffer" + ) return buffer, dtype @@ -447,9 +493,13 @@ class _CuDFDataFrame: ``cudf.DataFrame.__dataframe__`` as objects with the methods and attributes defined on this class. """ - def __init__(self, df : 'cudf.core.dataframe.DataFrame', - nan_as_null : bool = True, - allow_copy : bool = True) -> None: + + def __init__( + self, + df: "cudf.core.dataframe.DataFrame", + nan_as_null: bool = True, + allow_copy: bool = True, + ) -> None: """ Constructor - an instance of this (private) class is returned from `cudf.DataFrame.__dataframe__`. @@ -482,41 +532,43 @@ def column_names(self) -> Iterable[str]: return self._df.columns.tolist() def get_column(self, i: int) -> _CuDFColumn: - return _CuDFColumn( - self._df.iloc[:, i], allow_copy=self._allow_copy) + return _CuDFColumn(self._df.iloc[:, i], allow_copy=self._allow_copy) def get_column_by_name(self, name: str) -> _CuDFColumn: - return _CuDFColumn( - self._df[name], allow_copy=self._allow_copy) + return _CuDFColumn(self._df[name], allow_copy=self._allow_copy) def get_columns(self) -> Iterable[_CuDFColumn]: - return [_CuDFColumn(self._df[name], allow_copy=self._allow_copy) - for name in self._df.columns] + return [ + _CuDFColumn(self._df[name], allow_copy=self._allow_copy) + for name in self._df.columns + ] - def select_columns(self, indices: Sequence[int]) -> '_CuDFDataFrame': + def select_columns(self, indices: Sequence[int]) -> "_CuDFDataFrame": if not isinstance(indices, collections.abc.Sequence): raise ValueError("`indices` is not a sequence") return _CuDFDataFrame(self._df.iloc[:, indices]) - def select_columns_by_name(self, names: Sequence[str]) ->\ - '_CuDFDataFrame': + def select_columns_by_name(self, names: Sequence[str]) -> "_CuDFDataFrame": if not isinstance(names, collections.Sequence): raise ValueError("`names` is not a sequence") - return _CuDFDataFrame(self._df.loc[:, names], self._nan_as_null, - self._allow_copy) + return _CuDFDataFrame( + self._df.loc[:, names], self._nan_as_null, self._allow_copy + ) - def get_chunks(self, n_chunks : Optional[int] = None) -> \ - Iterable['_CuDFDataFrame']: + def get_chunks( + self, n_chunks: Optional[int] = None + ) -> Iterable["_CuDFDataFrame"]: """ Return an iterator yielding the chunks. """ return (self,) -def __dataframe__(self, nan_as_null : bool = False, - allow_copy : bool = True) -> _CuDFDataFrame: +def __dataframe__( + self, nan_as_null: bool = False, allow_copy: bool = True +) -> _CuDFDataFrame: """ The public method to attach to cudf.DataFrame. @@ -532,8 +584,7 @@ def __dataframe__(self, nan_as_null : bool = False, Currently, if this flag is set to ``False`` and a copy is needed, a ``RuntimeError`` will be raised. """ - return _CuDFDataFrame( - self, nan_as_null=nan_as_null, allow_copy=allow_copy) + return _CuDFDataFrame(self, nan_as_null=nan_as_null, allow_copy=allow_copy) """ @@ -569,21 +620,22 @@ def __dataframe__(self, nan_as_null : bool = False, _CP_DTYPES = {0: _INTS, 1: _UINTS, 2: _FLOATS, 20: {8: bool}} -def from_dataframe(df : DataFrameObject, allow_copy: bool = False) ->\ - _CuDFDataFrame : +def from_dataframe( + df: DataFrameObject, allow_copy: bool = False +) -> _CuDFDataFrame: """ Construct a cudf DataFrame from ``df`` if it supports ``__dataframe__`` """ if isinstance(df, cudf.DataFrame): return df - if not hasattr(df, '__dataframe__'): + if not hasattr(df, "__dataframe__"): raise ValueError("`df` does not support __dataframe__") return _from_dataframe(df.__dataframe__(allow_copy=allow_copy)) -def _from_dataframe(df : DataFrameObject) -> _CuDFDataFrame : +def _from_dataframe(df: DataFrameObject) -> _CuDFDataFrame: """ Create a cudf DataFrame object from DataFrameObject. """ @@ -608,8 +660,9 @@ def _from_dataframe(df : DataFrameObject) -> _CuDFDataFrame : columns[name], _buf = _protocol_to_cudf_column_string(col) else: - raise NotImplementedError(f"Data type {col.dtype[0]}" - " not handled yet") + raise NotImplementedError( + f"Data type {col.dtype[0]}" " not handled yet" + ) _buffers.append(_buf) @@ -618,8 +671,9 @@ def _from_dataframe(df : DataFrameObject) -> _CuDFDataFrame : return df_new -def _protocol_to_cudf_column_numeric(col: ColumnObject) -> \ - Tuple[cudf.core.column.NumericalColumn, _CuDFBuffer]: +def _protocol_to_cudf_column_numeric( + col: ColumnObject, +) -> Tuple[cudf.core.column.NumericalColumn, _CuDFBuffer]: """ Convert an int, uint, float or bool protocol column to the corresponding cudf column @@ -627,31 +681,38 @@ def _protocol_to_cudf_column_numeric(col: ColumnObject) -> \ if col.offset != 0: raise NotImplementedError("column.offset > 0 not handled yet") - _dbuffer, _ddtype = col.get_buffers()['data'] + _dbuffer, _ddtype = col.get_buffers()["data"] _check_buffer_is_on_gpu(_dbuffer) - dcol = build_column(Buffer(_dbuffer.ptr, _dbuffer.bufsize), - protocol_dtype_to_cupy_dtype(_ddtype)) + dcol = build_column( + Buffer(_dbuffer.ptr, _dbuffer.bufsize), + protocol_dtype_to_cupy_dtype(_ddtype), + ) return _set_missing_values(col, dcol), _dbuffer -def _check_buffer_is_on_gpu(buffer : _CuDFBuffer) -> None: - if buffer.__dlpack_device__()[0] != _Device.CUDA and \ - not buffer._allow_copy: - raise TypeError("This operation must copy data from CPU to GPU. " - "Set `allow_copy=True` to allow it.") +def _check_buffer_is_on_gpu(buffer: _CuDFBuffer) -> None: + if ( + buffer.__dlpack_device__()[0] != _Device.CUDA + and not buffer._allow_copy + ): + raise TypeError( + "This operation must copy data from CPU to GPU. " + "Set `allow_copy=True` to allow it." + ) -def _set_missing_values(protocol_col: _CuDFColumn, - cudf_col: 'cudf.core.dataframe.DataFrame') -> \ - cudf.core.column.ColumnBase: +def _set_missing_values( + protocol_col: _CuDFColumn, cudf_col: "cudf.core.dataframe.DataFrame" +) -> cudf.core.column.ColumnBase: null_kind, null_value = protocol_col.describe_null if null_kind != 0: assert null_kind == 3, "cudf supports only bit mask, " f"null_kind should be 3, got: {null_kind}." _mask_buffer, _mask_dtype = protocol_col.get_buffers()["validity"] - bitmask = cp.asarray(Buffer(_mask_buffer.ptr, _mask_buffer.bufsize), - cp.bool8) + bitmask = cp.asarray( + Buffer(_mask_buffer.ptr, _mask_buffer.bufsize), cp.bool8 + ) cudf_col[~bitmask] = None return cudf_col @@ -666,32 +727,40 @@ def protocol_dtype_to_cupy_dtype(_dtype) -> cp.dtype: return _CP_DTYPES[kind][bitwidth] -def _protocol_to_cudf_column_categorical(col : ColumnObject) -> \ - Tuple[cudf.core.column.CategoricalColumn, _CuDFBuffer] : +def _protocol_to_cudf_column_categorical( + col: ColumnObject, +) -> Tuple[cudf.core.column.CategoricalColumn, _CuDFBuffer]: """ Convert a categorical column to a Series instance """ ordered, is_dict, mapping = col.describe_categorical if not is_dict: - raise NotImplementedError("Non-dictionary categoricals" - " not supported yet") + raise NotImplementedError( + "Non-dictionary categoricals" " not supported yet" + ) categories = as_column(mapping.values()) - codes_buffer, codes_dtype = col.get_buffers()['data'] + codes_buffer, codes_dtype = col.get_buffers()["data"] _check_buffer_is_on_gpu(codes_buffer) cdtype = protocol_dtype_to_cupy_dtype(codes_dtype) - codes = build_column(Buffer(codes_buffer.ptr, codes_buffer.bufsize), - cdtype) - - cudfcol = build_categorical_column(categories=categories, codes=codes, - mask=codes.base_mask, size=codes.size, - ordered=ordered) + codes = build_column( + Buffer(codes_buffer.ptr, codes_buffer.bufsize), cdtype + ) + + cudfcol = build_categorical_column( + categories=categories, + codes=codes, + mask=codes.base_mask, + size=codes.size, + ordered=ordered, + ) return _set_missing_values(col, cudfcol), codes_buffer -def _protocol_to_cudf_column_string(col : ColumnObject) -> \ - Tuple[cudf.core.column.StringColumn, Tuple[_CuDFBuffer]] : +def _protocol_to_cudf_column_string( + col: ColumnObject, +) -> Tuple[cudf.core.column.StringColumn, Tuple[_CuDFBuffer]]: """ Convert a string ColumnObject to cudf Column object. """ @@ -701,17 +770,22 @@ def _protocol_to_cudf_column_string(col : ColumnObject) -> \ # Retrieve the data buffer containing the UTF-8 code units data_buffer, data_dtype = buffers["data"] _check_buffer_is_on_gpu(data_buffer) - encoded_string = build_column(Buffer(data_buffer.ptr, data_buffer.bufsize), - protocol_dtype_to_cupy_dtype(data_dtype)) + encoded_string = build_column( + Buffer(data_buffer.ptr, data_buffer.bufsize), + protocol_dtype_to_cupy_dtype(data_dtype), + ) # Retrieve the offsets buffer containing the index offsets demarcating # the beginning and end of each string offset_buffer, offset_dtype = buffers["offsets"] _check_buffer_is_on_gpu(offset_buffer) - offsets = build_column(Buffer(offset_buffer.ptr, offset_buffer.bufsize), - protocol_dtype_to_cupy_dtype(offset_dtype)) - - col_str = build_column(None, dtype=cp.dtype('O'), - children=(offsets, encoded_string)) + offsets = build_column( + Buffer(offset_buffer.ptr, offset_buffer.bufsize), + protocol_dtype_to_cupy_dtype(offset_dtype), + ) + + col_str = build_column( + None, dtype=cp.dtype("O"), children=(offsets, encoded_string) + ) return _set_missing_values(col, col_str), buffers diff --git a/python/cudf/cudf/tests/test_df_protocol.py b/python/cudf/cudf/tests/test_df_protocol.py index 45fb30ce998..88e040cfdf3 100644 --- a/python/cudf/cudf/tests/test_df_protocol.py +++ b/python/cudf/cudf/tests/test_df_protocol.py @@ -23,8 +23,9 @@ def assert_buffer_equal(buffer_and_dtype: Tuple[_CuDFBuffer, Any], cudfcol): buf, dtype = buffer_and_dtype device_id = cp.asarray(cudfcol.data).device.id assert buf.__dlpack_device__() == (2, device_id) - col_from_buf = build_column(Buffer(buf.ptr, buf.bufsize), - protocol_dtype_to_cupy_dtype(dtype)) + col_from_buf = build_column( + Buffer(buf.ptr, buf.bufsize), protocol_dtype_to_cupy_dtype(dtype) + ) # check that non null values are the equals as null are represented # by sentinel values in the buffer. non_null_idxs = cudfcol != cudf.NA @@ -43,24 +44,26 @@ def assert_column_equal(col: _CuDFColumn, cudfcol): assert col.offset == 0 assert col.null_count == cudfcol.null_count assert col.num_chunks() == 1 - if col.null_count == 0 : + if col.null_count == 0: pytest.raises(RuntimeError, col._get_validity_buffer) - assert col.get_buffers()['validity'] is None + assert col.get_buffers()["validity"] is None else: - assert_buffer_equal(col.get_buffers()['validity'], - cudfcol._get_mask_as_column().astype(cp.uint8)) + assert_buffer_equal( + col.get_buffers()["validity"], + cudfcol._get_mask_as_column().astype(cp.uint8), + ) if col.dtype[0] == _DtypeKind.CATEGORICAL: - assert_buffer_equal(col.get_buffers()['data'], cudfcol.codes) - assert col.get_buffers()['offsets'] is None + assert_buffer_equal(col.get_buffers()["data"], cudfcol.codes) + assert col.get_buffers()["offsets"] is None elif col.dtype[0] == _DtypeKind.STRING: - assert_buffer_equal(col.get_buffers()['data'], cudfcol.children[1]) - assert_buffer_equal(col.get_buffers()['offsets'], cudfcol.children[0]) + assert_buffer_equal(col.get_buffers()["data"], cudfcol.children[1]) + assert_buffer_equal(col.get_buffers()["offsets"], cudfcol.children[0]) else: - assert_buffer_equal(col.get_buffers()['data'], cudfcol) - assert col.get_buffers()['offsets'] is None + assert_buffer_equal(col.get_buffers()["data"], cudfcol) + assert col.get_buffers()["offsets"] is None if col.null_count == 0: assert col.describe_null == (0, None) @@ -124,7 +127,7 @@ def test_float_dtype(): def test_categorical_dtype(): cdf = cudf.DataFrame({"A": [1, 2, 5, 1]}) cdf["A"] = cdf["A"].astype("category") - col = cdf.__dataframe__().get_column_by_name('A') + col = cdf.__dataframe__().get_column_by_name("A") assert col.dtype[0] == _DtypeKind.CATEGORICAL assert col.describe_categorical == (False, True, {0: 1, 1: 2, 2: 5}) _test_from_dataframe_equals(cdf.__dataframe__(allow_copy=False)) @@ -142,33 +145,41 @@ def test_string_dtype(): def test_mixed_dtype(): - data_mixed = dict(int=[1, 2, 3], float=[1.5, 2.5, 3.5], - bool=[True, False, True], categorical=[5, 1, 5], - string=["rapidsai-cudf ", "", "df protocol"]) + data_mixed = dict( + int=[1, 2, 3], + float=[1.5, 2.5, 3.5], + bool=[True, False, True], + categorical=[5, 1, 5], + string=["rapidsai-cudf ", "", "df protocol"], + ) _test_datatype(data_mixed) def test_NA_int_dtype(): - data_int = dict(a=[1, None, 3, None, 5], - b=[9, 10, None, 7, 8], - c=[6, 19, 20, 100, 1000]) + data_int = dict( + a=[1, None, 3, None, 5], + b=[9, 10, None, 7, 8], + c=[6, 19, 20, 100, 1000], + ) _test_datatype(data_int) def test_NA_float_dtype(): - data_float = dict(a=[1.4, None, 3.6, None, 5.2], - b=[9.7, 10.9, None, 7.8, 8.2], - c=[6.1, 19.2, 20.3, 100.4, 1000.5]) + data_float = dict( + a=[1.4, None, 3.6, None, 5.2], + b=[9.7, 10.9, None, 7.8, 8.2], + c=[6.1, 19.2, 20.3, 100.4, 1000.5], + ) _test_datatype(data_float) def test_NA_categorical_dtype(): df = cudf.DataFrame({"A": [1, 2, 5, 1]}) df["B"] = df["A"].astype("category") - df.at[[1, 3], 'B'] = None # Set two items to null + df.at[[1, 3], "B"] = None # Set two items to null # Some detailed testing for correctness of dtype and null handling: - col = df.__dataframe__().get_column_by_name('B') + col = df.__dataframe__().get_column_by_name("B") assert col.dtype[0] == _DtypeKind.CATEGORICAL assert col.null_count == 2 assert col.describe_null == (3, 0) @@ -199,9 +210,11 @@ def test_NA_string_dtype(): def test_NA_mixed_dtype(): - data_mixed = dict(int=[1, None, 2, 3, 1000], - float=[None, 1.5, 2.5, 3.5, None], - bool=[True, None, False, None, None], - categorical=[5, 1, 5, 3, None], - string=[None, None, None, "df protocol", None]) + data_mixed = dict( + int=[1, None, 2, 3, 1000], + float=[None, 1.5, 2.5, 3.5, None], + bool=[True, None, False, None, None], + categorical=[5, 1, 5, 3, None], + string=[None, None, None, "df protocol", None], + ) _test_datatype(data_mixed) From 8b34a860bd72ccb39314c72e33c21d67eb6655ef Mon Sep 17 00:00:00 2001 From: iskode Date: Tue, 2 Nov 2021 09:35:23 +0000 Subject: [PATCH 43/60] fix test errors. --- python/cudf/cudf/tests/test_df_protocol.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/tests/test_df_protocol.py b/python/cudf/cudf/tests/test_df_protocol.py index 88e040cfdf3..fb96c6b3698 100644 --- a/python/cudf/cudf/tests/test_df_protocol.py +++ b/python/cudf/cudf/tests/test_df_protocol.py @@ -28,7 +28,7 @@ def assert_buffer_equal(buffer_and_dtype: Tuple[_CuDFBuffer, Any], cudfcol): ) # check that non null values are the equals as null are represented # by sentinel values in the buffer. - non_null_idxs = cudfcol != cudf.NA + non_null_idxs = cudf.Series(cudfcol) != cudf.NA assert_eq(col_from_buf[non_null_idxs], cudfcol[non_null_idxs]) if dtype[0] != _DtypeKind.BOOL: From eba1cdd8792b2ec892fbe783287608a776c0a62c Mon Sep 17 00:00:00 2001 From: iskode Date: Tue, 2 Nov 2021 09:36:13 +0000 Subject: [PATCH 44/60] minor style changes --- python/cudf/cudf/core/df_protocol.py | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index debb0d29079..458bdb962bb 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -138,7 +138,7 @@ def __init__( """ if not isinstance(column, cudf.Series): raise NotImplementedError( - "Columns of type {} not handled " "yet".format(type(column)) + "Columns of type {} not handled yet".format(type(column)) ) # Store the column as a private attribute @@ -238,7 +238,7 @@ def _dtype_from_cudfdtype(self, dtype) -> Tuple[_DtypeKind, int, str, str]: dtype = self._col.codes.dtype else: raise ValueError( - f"Data type {dtype} not supported by exchange" "protocol" + f"Data type {dtype} not supported by exchange protocol" ) if kind not in _SUPPORTED_KINDS: @@ -312,7 +312,7 @@ def describe_null(self) -> Tuple[int, Any]: else: raise NotImplementedError( - f"Data type {self.dtype}" " not yet supported" + f"Data type {self.dtype} not yet supported" ) @property @@ -414,12 +414,14 @@ def _get_data_buffer( else: raise NotImplementedError( - f"Data type {self._col.dtype}" " not handled yet" + f"Data type {self._col.dtype} not handled yet" ) return buffer, dtype - def _get_validity_buffer(self) -> Tuple[_CuDFBuffer, Any]: + def _get_validity_buffer( + self, + ) -> Tuple[_CuDFBuffer, Tuple[_DtypeKind, int, str, str]]: """ Return the buffer containing the mask values indicating missing data and the buffer's associated dtype. @@ -452,14 +454,16 @@ def _get_validity_buffer(self) -> Tuple[_CuDFBuffer, Any]: ) elif null == 0: raise RuntimeError( - "This column is non-nullable" " so does not have a mask" + "This column is non-nullable so does not have a mask" ) else: raise NotImplementedError( - f"See {self.__class__.__name__}" ".describe_null method." + f"See {self.__class__.__name__}.describe_null method." ) - def _get_offsets_buffer(self) -> Tuple[_CuDFBuffer, Any]: + def _get_offsets_buffer( + self, + ) -> Tuple[_CuDFBuffer, Tuple[_DtypeKind, int, str, str]]: """ Return the buffer containing the offset values for variable-size binary data (e.g., variable-length strings) @@ -661,7 +665,7 @@ def _from_dataframe(df: DataFrameObject) -> _CuDFDataFrame: else: raise NotImplementedError( - f"Data type {col.dtype[0]}" " not handled yet" + f"Data type {col.dtype[0]} not handled yet" ) _buffers.append(_buf) @@ -736,7 +740,7 @@ def _protocol_to_cudf_column_categorical( ordered, is_dict, mapping = col.describe_categorical if not is_dict: raise NotImplementedError( - "Non-dictionary categoricals" " not supported yet" + "Non-dictionary categoricals not supported yet" ) categories = as_column(mapping.values()) From 5cffc2f9a1bfd6356173cf2f82f36bd87860c2b7 Mon Sep 17 00:00:00 2001 From: iskode Date: Wed, 3 Nov 2021 10:25:58 +0000 Subject: [PATCH 45/60] remove incorrect comment. --- python/cudf/cudf/core/df_protocol.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index 458bdb962bb..71910d241c0 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -585,8 +585,6 @@ def __dataframe__( allowed to make a copy of the data. For example, copying data would be necessary if a library supports strided buffers, given that this protocol specifies contiguous buffers. - Currently, if this flag is set to ``False`` and a copy is needed, a - ``RuntimeError`` will be raised. """ return _CuDFDataFrame(self, nan_as_null=nan_as_null, allow_copy=allow_copy) From 5f27e66f725bb467d14546befe3dc7955af3c1f4 Mon Sep 17 00:00:00 2001 From: iskode Date: Wed, 3 Nov 2021 17:49:32 +0000 Subject: [PATCH 46/60] fix xome mypy check errors. --- python/cudf/cudf/core/df_protocol.py | 88 +++++++++++++--------- python/cudf/cudf/tests/test_df_protocol.py | 2 +- 2 files changed, 52 insertions(+), 38 deletions(-) diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index 71910d241c0..9e2b8b33730 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -1,10 +1,19 @@ import collections import enum -from typing import Any, Dict, Iterable, Optional, Sequence, Tuple +from typing import ( + Any, + Dict, + Iterable, + Optional, + Sequence, + Tuple, + Union, + Mapping +) import cupy as cp import numpy as np -from numba import cuda +from numba.cuda import as_cuda_array import cudf from cudf.core.buffer import Buffer @@ -44,6 +53,7 @@ class _Device(enum.IntEnum): _k.BOOL, _k.STRING, ) +ProtoDtype = Tuple[_DtypeKind, int, str, str] class _CuDFBuffer: @@ -85,7 +95,7 @@ def __dlpack__(self): DLPack not implemented in NumPy yet, so leave it out here. """ try: - cudarray = cuda.as_cuda_array(self._buf).view(self._dtype) + cudarray = as_cuda_array(self._buf).view(self._dtype) res = cp.asarray(cudarray).toDlpack() except ValueError: @@ -136,13 +146,12 @@ def __init__( Note: doesn't deal with extension arrays yet, just assume a regular Series/ndarray for now. """ - if not isinstance(column, cudf.Series): - raise NotImplementedError( - "Columns of type {} not handled yet".format(type(column)) + if not isinstance(column, cudf.core.column.ColumnBase): + raise TypeError( + "column must be a subtype of df.core.column.ColumnBase," + f"got {type(column)}" ) - - # Store the column as a private attribute - self._col = as_column(column) + self._col = column self._nan_as_null = nan_as_null self._allow_copy = allow_copy @@ -161,7 +170,7 @@ def offset(self) -> int: return 0 @property - def dtype(self) -> Tuple[_DtypeKind, int, str, str]: + def dtype(self) -> ProtoDtype: """ Dtype description as a tuple ``(kind, bit-width, format string, endianness)`` @@ -210,7 +219,7 @@ def dtype(self) -> Tuple[_DtypeKind, int, str, str]: return self._dtype_from_cudfdtype(dtype) - def _dtype_from_cudfdtype(self, dtype) -> Tuple[_DtypeKind, int, str, str]: + def _dtype_from_cudfdtype(self, dtype) -> ProtoDtype: """ See `self.dtype` for details. """ @@ -345,7 +354,7 @@ def get_chunks( """ return (self,) - def get_buffers(self) -> Dict[str, _CuDFBuffer]: + def get_buffers(self) -> Mapping[str, Union[Tuple[_CuDFBuffer, ProtoDtype], None]]: """ Return a dictionary containing the underlying buffers. @@ -382,7 +391,7 @@ def get_buffers(self) -> Dict[str, _CuDFBuffer]: def _get_data_buffer( self, - ) -> Tuple[_CuDFBuffer, Tuple[_DtypeKind, int, str, str]]: + ) -> Tuple[_CuDFBuffer, ProtoDtype]: """ Return the buffer containing the data and the buffer's associated dtype. @@ -421,7 +430,7 @@ def _get_data_buffer( def _get_validity_buffer( self, - ) -> Tuple[_CuDFBuffer, Tuple[_DtypeKind, int, str, str]]: + ) -> Tuple[_CuDFBuffer, ProtoDtype]: """ Return the buffer containing the mask values indicating missing data and the buffer's associated dtype. @@ -463,7 +472,7 @@ def _get_validity_buffer( def _get_offsets_buffer( self, - ) -> Tuple[_CuDFBuffer, Tuple[_DtypeKind, int, str, str]]: + ) -> Tuple[_CuDFBuffer, ProtoDtype]: """ Return the buffer containing the offset values for variable-size binary data (e.g., variable-length strings) @@ -536,14 +545,17 @@ def column_names(self) -> Iterable[str]: return self._df.columns.tolist() def get_column(self, i: int) -> _CuDFColumn: - return _CuDFColumn(self._df.iloc[:, i], allow_copy=self._allow_copy) + return _CuDFColumn(as_column(self._df.iloc[:, i]), + allow_copy=self._allow_copy) def get_column_by_name(self, name: str) -> _CuDFColumn: - return _CuDFColumn(self._df[name], allow_copy=self._allow_copy) + return _CuDFColumn(as_column(self._df[name]), + allow_copy=self._allow_copy) def get_columns(self) -> Iterable[_CuDFColumn]: return [ - _CuDFColumn(self._df[name], allow_copy=self._allow_copy) + _CuDFColumn(as_column(self._df[name]), + allow_copy=self._allow_copy) for name in self._df.columns ] @@ -674,22 +686,24 @@ def _from_dataframe(df: DataFrameObject) -> _CuDFDataFrame: def _protocol_to_cudf_column_numeric( - col: ColumnObject, -) -> Tuple[cudf.core.column.NumericalColumn, _CuDFBuffer]: + col: _CuDFColumn, +) -> Tuple[cudf.core.column.NumericalColumn, + Dict[str, Tuple[_CuDFBuffer, ProtoDtype]]]: """ Convert an int, uint, float or bool protocol column to the corresponding cudf column """ if col.offset != 0: raise NotImplementedError("column.offset > 0 not handled yet") - - _dbuffer, _ddtype = col.get_buffers()["data"] + + buffers = col.get_buffers() + _dbuffer, _ddtype = buffers["data"] _check_buffer_is_on_gpu(_dbuffer) dcol = build_column( Buffer(_dbuffer.ptr, _dbuffer.bufsize), protocol_dtype_to_cupy_dtype(_ddtype), ) - return _set_missing_values(col, dcol), _dbuffer + return _set_missing_values(col, dcol), buffers def _check_buffer_is_on_gpu(buffer: _CuDFBuffer) -> None: @@ -704,23 +718,20 @@ def _check_buffer_is_on_gpu(buffer: _CuDFBuffer) -> None: def _set_missing_values( - protocol_col: _CuDFColumn, cudf_col: "cudf.core.dataframe.DataFrame" + protocol_col: _CuDFColumn, cudf_col: cudf.core.column.ColumnBase ) -> cudf.core.column.ColumnBase: - null_kind, null_value = protocol_col.describe_null - if null_kind != 0: - assert null_kind == 3, "cudf supports only bit mask, " - f"null_kind should be 3, got: {null_kind}." - _mask_buffer, _mask_dtype = protocol_col.get_buffers()["validity"] + valid_mask = protocol_col.get_buffers()["validity"] + if valid_mask is not None: bitmask = cp.asarray( - Buffer(_mask_buffer.ptr, _mask_buffer.bufsize), cp.bool8 + Buffer(valid_mask[0].ptr, valid_mask[0].bufsize), cp.bool8 ) cudf_col[~bitmask] = None return cudf_col -def protocol_dtype_to_cupy_dtype(_dtype) -> cp.dtype: +def protocol_dtype_to_cupy_dtype(_dtype: ProtoDtype) -> cp.dtype: kind = _dtype[0] bitwidth = _dtype[1] if _dtype[0] not in _SUPPORTED_KINDS: @@ -730,8 +741,9 @@ def protocol_dtype_to_cupy_dtype(_dtype) -> cp.dtype: def _protocol_to_cudf_column_categorical( - col: ColumnObject, -) -> Tuple[cudf.core.column.CategoricalColumn, _CuDFBuffer]: + col: _CuDFColumn, +) -> Tuple[cudf.core.column.CategoricalColumn, + Dict[str, Tuple[_CuDFBuffer, ProtoDtype]]]: """ Convert a categorical column to a Series instance """ @@ -742,7 +754,8 @@ def _protocol_to_cudf_column_categorical( ) categories = as_column(mapping.values()) - codes_buffer, codes_dtype = col.get_buffers()["data"] + buffers = col.get_buffers() + codes_buffer, codes_dtype = buffers["data"] _check_buffer_is_on_gpu(codes_buffer) cdtype = protocol_dtype_to_cupy_dtype(codes_dtype) codes = build_column( @@ -757,12 +770,13 @@ def _protocol_to_cudf_column_categorical( ordered=ordered, ) - return _set_missing_values(col, cudfcol), codes_buffer + return _set_missing_values(col, cudfcol), buffers def _protocol_to_cudf_column_string( - col: ColumnObject, -) -> Tuple[cudf.core.column.StringColumn, Tuple[_CuDFBuffer]]: + col: _CuDFColumn, +) -> Tuple[cudf.core.column.StringColumn, + Dict[str, Tuple[_CuDFBuffer, ProtoDtype]]]: """ Convert a string ColumnObject to cudf Column object. """ diff --git a/python/cudf/cudf/tests/test_df_protocol.py b/python/cudf/cudf/tests/test_df_protocol.py index fb96c6b3698..b97ea950cee 100644 --- a/python/cudf/cudf/tests/test_df_protocol.py +++ b/python/cudf/cudf/tests/test_df_protocol.py @@ -26,7 +26,7 @@ def assert_buffer_equal(buffer_and_dtype: Tuple[_CuDFBuffer, Any], cudfcol): col_from_buf = build_column( Buffer(buf.ptr, buf.bufsize), protocol_dtype_to_cupy_dtype(dtype) ) - # check that non null values are the equals as null are represented + # check that non null values are the equals as nulls are represented # by sentinel values in the buffer. non_null_idxs = cudf.Series(cudfcol) != cudf.NA assert_eq(col_from_buf[non_null_idxs], cudfcol[non_null_idxs]) From ac1ceb846d15e9ef9012fadac36669405ef614ce Mon Sep 17 00:00:00 2001 From: iskode Date: Tue, 9 Nov 2021 21:34:46 +0000 Subject: [PATCH 47/60] fix mypy errors --- python/cudf/cudf/core/df_protocol.py | 140 +++++++++++++++------------ 1 file changed, 79 insertions(+), 61 deletions(-) diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index 9e2b8b33730..d8eb9d09dd3 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -4,11 +4,11 @@ Any, Dict, Iterable, + Mapping, Optional, Sequence, Tuple, - Union, - Mapping + cast, ) import cupy as cp @@ -244,7 +244,10 @@ def _dtype_from_cudfdtype(self, dtype) -> ProtoDtype: kind = _k.CATEGORICAL # Codes and categories' dtypes are different. # We use codes' dtype as these are stored in the buffer. - dtype = self._col.codes.dtype + codes = cast( + cudf.core.column.CategoricalColumn, self._col + ).codes + dtype = codes.dtype else: raise ValueError( f"Data type {dtype} not supported by exchange protocol" @@ -282,12 +285,12 @@ def describe_categorical(self) -> Tuple[bool, bool, Dict[int, Any]]: "`describe_categorical only works on " "a column with categorical dtype!" ) - - ordered = self._col.dtype.ordered + categ_col = cast(cudf.core.column.CategoricalColumn, self._col) + ordered = bool(categ_col.dtype.ordered) is_dictionary = True # NOTE: this shows the children approach is better, transforming # `categories` to a "mapping" dict is inefficient - categories = self._col.categories + categories = categ_col.categories mapping = {ix: val for ix, val in enumerate(categories.values_host)} return ordered, is_dictionary, mapping @@ -354,7 +357,9 @@ def get_chunks( """ return (self,) - def get_buffers(self) -> Mapping[str, Union[Tuple[_CuDFBuffer, ProtoDtype], None]]: + def get_buffers( + self, + ) -> Mapping[str, Optional[Tuple[_CuDFBuffer, ProtoDtype]]]: """ Return a dictionary containing the underlying buffers. @@ -389,48 +394,39 @@ def get_buffers(self) -> Mapping[str, Union[Tuple[_CuDFBuffer, ProtoDtype], None return buffers - def _get_data_buffer( - self, - ) -> Tuple[_CuDFBuffer, ProtoDtype]: + def _get_data_buffer(self,) -> Tuple[_CuDFBuffer, ProtoDtype]: """ Return the buffer containing the data and the buffer's associated dtype. """ _k = _DtypeKind if self.dtype[0] in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL): - buffer = _CuDFBuffer( - self._col.data, self._col.dtype, allow_copy=self._allow_copy - ) + col_data = self._col dtype = self.dtype elif self.dtype[0] == _k.CATEGORICAL: - codes = self._col.codes - buffer = _CuDFBuffer( - self._col.codes.data, - self._col.codes.dtype, - allow_copy=self._allow_copy, - ) - dtype = self._dtype_from_cudfdtype(codes.dtype) + col_data = cast( + cudf.core.column.CategoricalColumn, self._col + ).codes + dtype = self._dtype_from_cudfdtype(col_data.dtype) elif self.dtype[0] == _k.STRING: - encoded_string = self._col.children[1] - buffer = _CuDFBuffer( - encoded_string.data, - encoded_string.dtype, - allow_copy=self._allow_copy, - ) - dtype = self._dtype_from_cudfdtype(encoded_string.dtype) + col_data = self._col.children[1] + dtype = self._dtype_from_cudfdtype(col_data.dtype) else: raise NotImplementedError( f"Data type {self._col.dtype} not handled yet" ) + assert (col_data is not None) and (col_data.data is not None), " " + f"col_data(.data) should not be None when dtype = {dtype}" + buffer = _CuDFBuffer( + col_data.data, col_data.dtype, allow_copy=self._allow_copy + ) return buffer, dtype - def _get_validity_buffer( - self, - ) -> Tuple[_CuDFBuffer, ProtoDtype]: + def _get_validity_buffer(self,) -> Tuple[_CuDFBuffer, ProtoDtype]: """ Return the buffer containing the mask values indicating missing data and the buffer's associated dtype. @@ -442,17 +438,22 @@ def _get_validity_buffer( if null == 3: _k = _DtypeKind if self.dtype[0] == _k.CATEGORICAL: - buffer = _CuDFBuffer( - self._col.codes._get_mask_as_column().data, - cp.uint8, - allow_copy=self._allow_copy, - ) + valid_mask = cast( + cudf.core.column.CategoricalColumn, self._col + ).codes._get_mask_as_column() else: - buffer = _CuDFBuffer( - self._col._get_mask_as_column().data, - cp.uint8, - allow_copy=self._allow_copy, - ) + valid_mask = self._col._get_mask_as_column() + + # if (valid_mask is None) or (valid_mask.data is None) : + # raise RuntimeError("valid_mask and valid_mask.data" + # " should not be None when _CuDFColumn.describe_null[0] = 3") + assert (valid_mask is not None) and ( + valid_mask.data is not None + ), "valid_mask(.data) should not be None when " + "_CuDFColumn.describe_null[0] = 3" + buffer = _CuDFBuffer( + valid_mask.data, cp.uint8, allow_copy=self._allow_copy + ) dtype = (_k.UINT, 8, "C", "=") return buffer, dtype @@ -470,9 +471,7 @@ def _get_validity_buffer( f"See {self.__class__.__name__}.describe_null method." ) - def _get_offsets_buffer( - self, - ) -> Tuple[_CuDFBuffer, ProtoDtype]: + def _get_offsets_buffer(self,) -> Tuple[_CuDFBuffer, ProtoDtype]: """ Return the buffer containing the offset values for variable-size binary data (e.g., variable-length strings) @@ -484,6 +483,9 @@ def _get_offsets_buffer( _k = _DtypeKind if self.dtype[0] == _k.STRING: offsets = self._col.children[0] + assert (offsets is not None) and (offsets.data is not None), " " + "offsets(.data) should not be None for string column" + buffer = _CuDFBuffer( offsets.data, offsets.dtype, allow_copy=self._allow_copy ) @@ -545,17 +547,18 @@ def column_names(self) -> Iterable[str]: return self._df.columns.tolist() def get_column(self, i: int) -> _CuDFColumn: - return _CuDFColumn(as_column(self._df.iloc[:, i]), - allow_copy=self._allow_copy) + return _CuDFColumn( + as_column(self._df.iloc[:, i]), allow_copy=self._allow_copy + ) def get_column_by_name(self, name: str) -> _CuDFColumn: - return _CuDFColumn(as_column(self._df[name]), - allow_copy=self._allow_copy) + return _CuDFColumn( + as_column(self._df[name]), allow_copy=self._allow_copy + ) def get_columns(self) -> Iterable[_CuDFColumn]: return [ - _CuDFColumn(as_column(self._df[name]), - allow_copy=self._allow_copy) + _CuDFColumn(as_column(self._df[name]), allow_copy=self._allow_copy) for name in self._df.columns ] @@ -687,23 +690,26 @@ def _from_dataframe(df: DataFrameObject) -> _CuDFDataFrame: def _protocol_to_cudf_column_numeric( col: _CuDFColumn, -) -> Tuple[cudf.core.column.NumericalColumn, - Dict[str, Tuple[_CuDFBuffer, ProtoDtype]]]: +) -> Tuple[ + cudf.core.column.ColumnBase, + Mapping[str, Optional[Tuple[_CuDFBuffer, ProtoDtype]]], +]: """ Convert an int, uint, float or bool protocol column to the corresponding cudf column """ if col.offset != 0: raise NotImplementedError("column.offset > 0 not handled yet") - + buffers = col.get_buffers() + assert buffers["data"] is not None, "data buffer should not be None" _dbuffer, _ddtype = buffers["data"] _check_buffer_is_on_gpu(_dbuffer) - dcol = build_column( + cudfcol_num = build_column( Buffer(_dbuffer.ptr, _dbuffer.bufsize), protocol_dtype_to_cupy_dtype(_ddtype), ) - return _set_missing_values(col, dcol), buffers + return _set_missing_values(col, cudfcol_num), buffers def _check_buffer_is_on_gpu(buffer: _CuDFBuffer) -> None: @@ -716,6 +722,12 @@ def _check_buffer_is_on_gpu(buffer: _CuDFBuffer) -> None: "Set `allow_copy=True` to allow it." ) + elif buffer.__dlpack_device__()[0] != _Device.CUDA and buffer._allow_copy: + raise NotImplementedError( + "Only cuDF/GPU dataframes are supported for now." + "CPU (like `Pandas`) dataframes will be supported shortly." + ) + def _set_missing_values( protocol_col: _CuDFColumn, cudf_col: cudf.core.column.ColumnBase @@ -742,8 +754,10 @@ def protocol_dtype_to_cupy_dtype(_dtype: ProtoDtype) -> cp.dtype: def _protocol_to_cudf_column_categorical( col: _CuDFColumn, -) -> Tuple[cudf.core.column.CategoricalColumn, - Dict[str, Tuple[_CuDFBuffer, ProtoDtype]]]: +) -> Tuple[ + cudf.core.column.ColumnBase, + Mapping[str, Optional[Tuple[_CuDFBuffer, ProtoDtype]]], +]: """ Convert a categorical column to a Series instance """ @@ -755,6 +769,7 @@ def _protocol_to_cudf_column_categorical( categories = as_column(mapping.values()) buffers = col.get_buffers() + assert buffers["data"] is not None, "data buffer should not be None" codes_buffer, codes_dtype = buffers["data"] _check_buffer_is_on_gpu(codes_buffer) cdtype = protocol_dtype_to_cupy_dtype(codes_dtype) @@ -775,8 +790,10 @@ def _protocol_to_cudf_column_categorical( def _protocol_to_cudf_column_string( col: _CuDFColumn, -) -> Tuple[cudf.core.column.StringColumn, - Dict[str, Tuple[_CuDFBuffer, ProtoDtype]]]: +) -> Tuple[ + cudf.core.column.ColumnBase, + Mapping[str, Optional[Tuple[_CuDFBuffer, ProtoDtype]]], +]: """ Convert a string ColumnObject to cudf Column object. """ @@ -784,6 +801,7 @@ def _protocol_to_cudf_column_string( buffers = col.get_buffers() # Retrieve the data buffer containing the UTF-8 code units + assert buffers["data"] is not None, "data buffer should never be None" data_buffer, data_dtype = buffers["data"] _check_buffer_is_on_gpu(data_buffer) encoded_string = build_column( @@ -793,6 +811,7 @@ def _protocol_to_cudf_column_string( # Retrieve the offsets buffer containing the index offsets demarcating # the beginning and end of each string + assert buffers["offsets"] is not None, "not possible for string column" offset_buffer, offset_dtype = buffers["offsets"] _check_buffer_is_on_gpu(offset_buffer) offsets = build_column( @@ -800,8 +819,7 @@ def _protocol_to_cudf_column_string( protocol_dtype_to_cupy_dtype(offset_dtype), ) - col_str = build_column( + cudfcol_str = build_column( None, dtype=cp.dtype("O"), children=(offsets, encoded_string) ) - - return _set_missing_values(col, col_str), buffers + return _set_missing_values(col, cudfcol_str), buffers From e274ea27574f29f9af6258cc676e17dc8ce2545e Mon Sep 17 00:00:00 2001 From: iskode Date: Tue, 9 Nov 2021 23:33:19 +0000 Subject: [PATCH 48/60] fix last mypy errors --- python/cudf/cudf/core/df_protocol.py | 76 ++++++++++++++-------------- 1 file changed, 37 insertions(+), 39 deletions(-) diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index d8eb9d09dd3..0ec66d7fa12 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -10,7 +10,7 @@ Tuple, cast, ) - +import warnings import cupy as cp import numpy as np from numba.cuda import as_cuda_array @@ -381,7 +381,6 @@ def get_buffers( buffer. """ buffers = {} - buffers["data"] = self._get_data_buffer() try: buffers["validity"] = self._get_validity_buffer() except RuntimeError: @@ -392,41 +391,11 @@ def get_buffers( except RuntimeError: buffers["offsets"] = None - return buffers - - def _get_data_buffer(self,) -> Tuple[_CuDFBuffer, ProtoDtype]: - """ - Return the buffer containing the data and - the buffer's associated dtype. - """ - _k = _DtypeKind - if self.dtype[0] in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL): - col_data = self._col - dtype = self.dtype - - elif self.dtype[0] == _k.CATEGORICAL: - col_data = cast( - cudf.core.column.CategoricalColumn, self._col - ).codes - dtype = self._dtype_from_cudfdtype(col_data.dtype) - - elif self.dtype[0] == _k.STRING: - col_data = self._col.children[1] - dtype = self._dtype_from_cudfdtype(col_data.dtype) - - else: - raise NotImplementedError( - f"Data type {self._col.dtype} not handled yet" - ) - assert (col_data is not None) and (col_data.data is not None), " " - f"col_data(.data) should not be None when dtype = {dtype}" - buffer = _CuDFBuffer( - col_data.data, col_data.dtype, allow_copy=self._allow_copy - ) + buffers["data"] = self._get_data_buffer() - return buffer, dtype + return buffers - def _get_validity_buffer(self,) -> Tuple[_CuDFBuffer, ProtoDtype]: + def _get_validity_buffer(self,) -> Optional[Tuple[_CuDFBuffer, ProtoDtype]]: """ Return the buffer containing the mask values indicating missing data and the buffer's associated dtype. @@ -444,9 +413,6 @@ def _get_validity_buffer(self,) -> Tuple[_CuDFBuffer, ProtoDtype]: else: valid_mask = self._col._get_mask_as_column() - # if (valid_mask is None) or (valid_mask.data is None) : - # raise RuntimeError("valid_mask and valid_mask.data" - # " should not be None when _CuDFColumn.describe_null[0] = 3") assert (valid_mask is not None) and ( valid_mask.data is not None ), "valid_mask(.data) should not be None when " @@ -471,7 +437,7 @@ def _get_validity_buffer(self,) -> Tuple[_CuDFBuffer, ProtoDtype]: f"See {self.__class__.__name__}.describe_null method." ) - def _get_offsets_buffer(self,) -> Tuple[_CuDFBuffer, ProtoDtype]: + def _get_offsets_buffer(self,) -> Optional[Tuple[_CuDFBuffer, ProtoDtype]]: """ Return the buffer containing the offset values for variable-size binary data (e.g., variable-length strings) @@ -498,6 +464,38 @@ def _get_offsets_buffer(self,) -> Tuple[_CuDFBuffer, ProtoDtype]: return buffer, dtype + def _get_data_buffer(self,) -> Tuple[_CuDFBuffer, ProtoDtype]: + """ + Return the buffer containing the data and + the buffer's associated dtype. + """ + _k = _DtypeKind + if self.dtype[0] in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL): + col_data = self._col + dtype = self.dtype + + elif self.dtype[0] == _k.CATEGORICAL: + col_data = cast( + cudf.core.column.CategoricalColumn, self._col + ).codes + dtype = self._dtype_from_cudfdtype(col_data.dtype) + + elif self.dtype[0] == _k.STRING: + col_data = self._col.children[1] + dtype = self._dtype_from_cudfdtype(col_data.dtype) + + else: + raise NotImplementedError( + f"Data type {self._col.dtype} not handled yet" + ) + assert (col_data is not None) and (col_data.data is not None), " " + f"col_data(.data) should not be None when dtype = {dtype}" + buffer = _CuDFBuffer( + col_data.data, col_data.dtype, allow_copy=self._allow_copy + ) + + return buffer, dtype + class _CuDFDataFrame: """ From 7aa325ed23b64f33d7cc85d22daafd4bea4dba34 Mon Sep 17 00:00:00 2001 From: iskode Date: Tue, 9 Nov 2021 23:41:36 +0000 Subject: [PATCH 49/60] rerun black,isort, flake8 --- python/cudf/cudf/core/df_protocol.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index 0ec66d7fa12..966c6597f8b 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -10,7 +10,7 @@ Tuple, cast, ) -import warnings + import cupy as cp import numpy as np from numba.cuda import as_cuda_array @@ -395,7 +395,9 @@ def get_buffers( return buffers - def _get_validity_buffer(self,) -> Optional[Tuple[_CuDFBuffer, ProtoDtype]]: + def _get_validity_buffer( + self, + ) -> Optional[Tuple[_CuDFBuffer, ProtoDtype]]: """ Return the buffer containing the mask values indicating missing data and the buffer's associated dtype. From 80bf86a104007faf437e2e471f4650150b956102 Mon Sep 17 00:00:00 2001 From: iskode Date: Tue, 9 Nov 2021 23:48:15 +0000 Subject: [PATCH 50/60] run isort on dataframe.py --- python/cudf/cudf/core/dataframe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 4988b422caf..c2c529320fc 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -40,7 +40,7 @@ is_string_dtype, is_struct_dtype, ) -from cudf.core import column, reshape, df_protocol +from cudf.core import column, df_protocol, reshape from cudf.core.abc import Serializable from cudf.core.column import ( as_column, From 9a3549c6ec020b492ac61c525d71339d0335ab96 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Isma=C3=ABl=20Kon=C3=A9?= Date: Wed, 10 Nov 2021 07:14:00 +0100 Subject: [PATCH 51/60] Remove _DTypeKind alias _k Co-authored-by: Bradley Dice --- python/cudf/cudf/core/df_protocol.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index 966c6597f8b..1f64074e2d2 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -44,15 +44,14 @@ class _Device(enum.IntEnum): ROCM = 10 -_k = _DtypeKind -_SUPPORTED_KINDS = ( - _k.INT, - _k.UINT, - _k.FLOAT, - _k.CATEGORICAL, - _k.BOOL, - _k.STRING, -) +_SUPPORTED_KINDS = { + _DtypeKind.INT, + _DtypeKind.UINT, + _DtypeKind.FLOAT, + _DtypeKind.CATEGORICAL, + _DtypeKind.BOOL, + _DtypeKind.STRING, +} ProtoDtype = Tuple[_DtypeKind, int, str, str] From b76d419f466f51cf0059e6b7ecc7a26a878ebd0e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Isma=C3=ABl=20Kon=C3=A9?= Date: Wed, 10 Nov 2021 07:15:32 +0100 Subject: [PATCH 52/60] Remove _DTypeKind alias _k Co-authored-by: Bradley Dice --- python/cudf/cudf/core/df_protocol.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index 1f64074e2d2..41cbd748cee 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -226,15 +226,14 @@ def _dtype_from_cudfdtype(self, dtype) -> ProtoDtype: # 'b', 'B' (bytes), 'S', 'a', (old-style string) 'V' (void) # not handled datetime and timedelta both map to datetime # (is timedelta handled?) - _k = _DtypeKind _np_kinds = { - "i": _k.INT, - "u": _k.UINT, - "f": _k.FLOAT, - "b": _k.BOOL, - "U": _k.STRING, - "M": _k.DATETIME, - "m": _k.DATETIME, + "i": _DtypeKind.INT, + "u": _DtypeKind.UINT, + "f": _DtypeKind.FLOAT, + "b": _DtypeKind.BOOL, + "U": _DtypeKind.STRING, + "M": _DtypeKind.DATETIME, + "m": _DtypeKind.DATETIME, } kind = _np_kinds.get(dtype.kind, None) if kind is None: From a34f1186138ea379c8aa74460129ce2746763ed6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Isma=C3=ABl=20Kon=C3=A9?= Date: Wed, 10 Nov 2021 07:16:04 +0100 Subject: [PATCH 53/60] Remove _DTypeKind alias _k Co-authored-by: Bradley Dice --- python/cudf/cudf/core/df_protocol.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index 41cbd748cee..4f1e88f14b0 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -256,7 +256,7 @@ def _dtype_from_cudfdtype(self, dtype) -> ProtoDtype: bitwidth = dtype.itemsize * 8 format_str = dtype.str - endianness = dtype.byteorder if kind != _k.CATEGORICAL else "=" + endianness = dtype.byteorder if kind != _DtypeKind.CATEGORICAL else "=" return (kind, bitwidth, format_str, endianness) @property From 99ca31d972c4b1a040fc31a81ee69e2f3d4eaf07 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Isma=C3=ABl=20Kon=C3=A9?= Date: Wed, 10 Nov 2021 07:16:32 +0100 Subject: [PATCH 54/60] Remove _DTypeKind alias _k Co-authored-by: Bradley Dice --- python/cudf/cudf/core/df_protocol.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index 4f1e88f14b0..19dbef46eb0 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -405,8 +405,7 @@ def _get_validity_buffer( null, invalid = self.describe_null if null == 3: - _k = _DtypeKind - if self.dtype[0] == _k.CATEGORICAL: + if self.dtype[0] == _DtypeKind.CATEGORICAL: valid_mask = cast( cudf.core.column.CategoricalColumn, self._col ).codes._get_mask_as_column() From 87bba3236593b4fe9e51b5ba898a6f33b7f46ffa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Isma=C3=ABl=20Kon=C3=A9?= Date: Wed, 10 Nov 2021 07:16:59 +0100 Subject: [PATCH 55/60] Remove _DTypeKind alias _k Co-authored-by: Bradley Dice --- python/cudf/cudf/core/df_protocol.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index 19dbef46eb0..520944a085d 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -419,7 +419,7 @@ def _get_validity_buffer( buffer = _CuDFBuffer( valid_mask.data, cp.uint8, allow_copy=self._allow_copy ) - dtype = (_k.UINT, 8, "C", "=") + dtype = (_DtypeKind.UINT, 8, "C", "=") return buffer, dtype elif null == 1: From 535e56e37848307c78499eaec217140e6a591600 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Isma=C3=ABl=20Kon=C3=A9?= Date: Wed, 10 Nov 2021 07:29:03 +0100 Subject: [PATCH 56/60] add space to multi-line comment. Co-authored-by: Bradley Dice --- python/cudf/cudf/core/df_protocol.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index 520944a085d..bb9e69b8c4f 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -721,7 +721,7 @@ def _check_buffer_is_on_gpu(buffer: _CuDFBuffer) -> None: elif buffer.__dlpack_device__()[0] != _Device.CUDA and buffer._allow_copy: raise NotImplementedError( - "Only cuDF/GPU dataframes are supported for now." + "Only cuDF/GPU dataframes are supported for now. " "CPU (like `Pandas`) dataframes will be supported shortly." ) From b421a2954cfc157f3ca48265168327b469257d0e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Isma=C3=ABl=20Kon=C3=A9?= Date: Wed, 10 Nov 2021 07:29:54 +0100 Subject: [PATCH 57/60] Remove _DTypeKind alias _k Co-authored-by: Bradley Dice --- python/cudf/cudf/core/df_protocol.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index bb9e69b8c4f..26b69ee70f0 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -239,7 +239,7 @@ def _dtype_from_cudfdtype(self, dtype) -> ProtoDtype: if kind is None: # Not a NumPy/CuPy dtype. Check if it's a categorical maybe if isinstance(dtype, cudf.CategoricalDtype): - kind = _k.CATEGORICAL + kind = _DtypeKind.CATEGORICAL # Codes and categories' dtypes are different. # We use codes' dtype as these are stored in the buffer. codes = cast( From 6ae5ee0f0b00170df015db5b706eb954a9ecb4d0 Mon Sep 17 00:00:00 2001 From: iskode Date: Wed, 10 Nov 2021 11:27:37 +0000 Subject: [PATCH 58/60] remove remaining _DtypeKind aliases --- python/cudf/cudf/core/df_protocol.py | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index 26b69ee70f0..8f258ce27b2 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -445,8 +445,7 @@ def _get_offsets_buffer(self,) -> Optional[Tuple[_CuDFBuffer, ProtoDtype]]: Raises RuntimeError if the data buffer does not have an associated offsets buffer. """ - _k = _DtypeKind - if self.dtype[0] == _k.STRING: + if self.dtype[0] == _DtypeKind.STRING: offsets = self._col.children[0] assert (offsets is not None) and (offsets.data is not None), " " "offsets(.data) should not be None for string column" @@ -468,18 +467,22 @@ def _get_data_buffer(self,) -> Tuple[_CuDFBuffer, ProtoDtype]: Return the buffer containing the data and the buffer's associated dtype. """ - _k = _DtypeKind - if self.dtype[0] in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL): + if self.dtype[0] in ( + _DtypeKind.INT, + _DtypeKind.UINT, + _DtypeKind.FLOAT, + _DtypeKind.BOOL, + ): col_data = self._col dtype = self.dtype - elif self.dtype[0] == _k.CATEGORICAL: + elif self.dtype[0] == _DtypeKind.CATEGORICAL: col_data = cast( cudf.core.column.CategoricalColumn, self._col ).codes dtype = self._dtype_from_cudfdtype(col_data.dtype) - elif self.dtype[0] == _k.STRING: + elif self.dtype[0] == _DtypeKind.STRING: col_data = self._col.children[1] dtype = self._dtype_from_cudfdtype(col_data.dtype) @@ -659,18 +662,22 @@ def _from_dataframe(df: DataFrameObject) -> _CuDFDataFrame: # We need a dict of columns here, with each column being a cudf column. columns = dict() - _k = _DtypeKind _buffers = [] # hold on to buffers, keeps memory alive for name in df.column_names(): col = df.get_column_by_name(name) - if col.dtype[0] in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL): + if col.dtype[0] in ( + _DtypeKind.INT, + _DtypeKind.UINT, + _DtypeKind.FLOAT, + _DtypeKind.BOOL, + ): columns[name], _buf = _protocol_to_cudf_column_numeric(col) - elif col.dtype[0] == _k.CATEGORICAL: + elif col.dtype[0] == _DtypeKind.CATEGORICAL: columns[name], _buf = _protocol_to_cudf_column_categorical(col) - elif col.dtype[0] == _k.STRING: + elif col.dtype[0] == _DtypeKind.STRING: columns[name], _buf = _protocol_to_cudf_column_string(col) else: From 581153fc413ef18dad14072ef6131c52bf17772d Mon Sep 17 00:00:00 2001 From: iskode Date: Wed, 10 Nov 2021 11:31:37 +0000 Subject: [PATCH 59/60] import DataFrameObject from df_protocol --- python/cudf/cudf/tests/test_df_protocol.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/cudf/cudf/tests/test_df_protocol.py b/python/cudf/cudf/tests/test_df_protocol.py index b97ea950cee..4408cafa80f 100644 --- a/python/cudf/cudf/tests/test_df_protocol.py +++ b/python/cudf/cudf/tests/test_df_protocol.py @@ -8,6 +8,7 @@ from cudf.core.buffer import Buffer from cudf.core.column import build_column from cudf.core.df_protocol import ( + DataFrameObject, _CuDFBuffer, _CuDFColumn, _DtypeKind, @@ -16,8 +17,6 @@ ) from cudf.testing._utils import assert_eq -DataFrameObject = Any - def assert_buffer_equal(buffer_and_dtype: Tuple[_CuDFBuffer, Any], cudfcol): buf, dtype = buffer_and_dtype From c1231cbc3ae62128167ab2ee9da462bd5b0b2dba Mon Sep 17 00:00:00 2001 From: iskode Date: Wed, 10 Nov 2021 11:58:56 +0000 Subject: [PATCH 60/60] rename assertion methods --- python/cudf/cudf/tests/test_df_protocol.py | 40 +++++++++++----------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/python/cudf/cudf/tests/test_df_protocol.py b/python/cudf/cudf/tests/test_df_protocol.py index 4408cafa80f..d24c8ca2860 100644 --- a/python/cudf/cudf/tests/test_df_protocol.py +++ b/python/cudf/cudf/tests/test_df_protocol.py @@ -79,7 +79,7 @@ def assert_dataframe_equal(dfo: DataFrameObject, df: cudf.DataFrame): assert_column_equal(dfo.get_column_by_name(col), df[col]._column) -def _test_from_dataframe_equals(dfobj): +def assert_from_dataframe_equals(dfobj): df2 = _from_dataframe(dfobj) assert_dataframe_equal(dfobj, df2) @@ -93,17 +93,17 @@ def _test_from_dataframe_equals(dfobj): raise TypeError(f"{type(dfobj._df)} not supported yet.") -def _test_from_dataframe_exception(dfobj): +def assert_from_dataframe_exception(dfobj): exception_msg = "This operation must copy data from CPU to GPU." " Set `allow_copy=True` to allow it." with pytest.raises(TypeError, match=exception_msg): _from_dataframe(dfobj) -def _test_datatype(data): +def assert_df_unique_dtype_cols(data): cdf = cudf.DataFrame(data=data) - _test_from_dataframe_equals(cdf.__dataframe__(allow_copy=False)) - _test_from_dataframe_equals(cdf.__dataframe__(allow_copy=True)) + assert_from_dataframe_equals(cdf.__dataframe__(allow_copy=False)) + assert_from_dataframe_equals(cdf.__dataframe__(allow_copy=True)) def test_from_dataframe(): @@ -115,12 +115,12 @@ def test_from_dataframe(): def test_int_dtype(): data_int = dict(a=[1, 2, 3], b=[9, 10, 11]) - _test_datatype(data_int) + assert_df_unique_dtype_cols(data_int) def test_float_dtype(): data_float = dict(a=[1.5, 2.5, 3.5], b=[9.2, 10.5, 11.8]) - _test_datatype(data_float) + assert_df_unique_dtype_cols(data_float) def test_categorical_dtype(): @@ -129,18 +129,18 @@ def test_categorical_dtype(): col = cdf.__dataframe__().get_column_by_name("A") assert col.dtype[0] == _DtypeKind.CATEGORICAL assert col.describe_categorical == (False, True, {0: 1, 1: 2, 2: 5}) - _test_from_dataframe_equals(cdf.__dataframe__(allow_copy=False)) - _test_from_dataframe_equals(cdf.__dataframe__(allow_copy=True)) + assert_from_dataframe_equals(cdf.__dataframe__(allow_copy=False)) + assert_from_dataframe_equals(cdf.__dataframe__(allow_copy=True)) def test_bool_dtype(): data_bool = dict(a=[True, True, False], b=[False, True, False]) - _test_datatype(data_bool) + assert_df_unique_dtype_cols(data_bool) def test_string_dtype(): data_string = dict(a=["a", "b", "cdef", "", "g"]) - _test_datatype(data_string) + assert_df_unique_dtype_cols(data_string) def test_mixed_dtype(): @@ -151,7 +151,7 @@ def test_mixed_dtype(): categorical=[5, 1, 5], string=["rapidsai-cudf ", "", "df protocol"], ) - _test_datatype(data_mixed) + assert_df_unique_dtype_cols(data_mixed) def test_NA_int_dtype(): @@ -160,7 +160,7 @@ def test_NA_int_dtype(): b=[9, 10, None, 7, 8], c=[6, 19, 20, 100, 1000], ) - _test_datatype(data_int) + assert_df_unique_dtype_cols(data_int) def test_NA_float_dtype(): @@ -169,7 +169,7 @@ def test_NA_float_dtype(): b=[9.7, 10.9, None, 7.8, 8.2], c=[6.1, 19.2, 20.3, 100.4, 1000.5], ) - _test_datatype(data_float) + assert_df_unique_dtype_cols(data_float) def test_NA_categorical_dtype(): @@ -184,13 +184,13 @@ def test_NA_categorical_dtype(): assert col.describe_null == (3, 0) assert col.num_chunks() == 1 assert col.describe_categorical == (False, True, {0: 1, 1: 2, 2: 5}) - _test_from_dataframe_equals(df.__dataframe__(allow_copy=False)) - _test_from_dataframe_equals(df.__dataframe__(allow_copy=True)) + assert_from_dataframe_equals(df.__dataframe__(allow_copy=False)) + assert_from_dataframe_equals(df.__dataframe__(allow_copy=True)) def test_NA_bool_dtype(): data_bool = dict(a=[None, True, False], b=[False, None, None]) - _test_datatype(data_bool) + assert_df_unique_dtype_cols(data_bool) def test_NA_string_dtype(): @@ -204,8 +204,8 @@ def test_NA_string_dtype(): assert col.null_count == 1 assert col.describe_null == (3, 0) assert col.num_chunks() == 1 - _test_from_dataframe_equals(df.__dataframe__(allow_copy=False)) - _test_from_dataframe_equals(df.__dataframe__(allow_copy=True)) + assert_from_dataframe_equals(df.__dataframe__(allow_copy=False)) + assert_from_dataframe_equals(df.__dataframe__(allow_copy=True)) def test_NA_mixed_dtype(): @@ -216,4 +216,4 @@ def test_NA_mixed_dtype(): categorical=[5, 1, 5, 3, None], string=[None, None, None, "df protocol", None], ) - _test_datatype(data_mixed) + assert_df_unique_dtype_cols(data_mixed)