From 8fc7e17b0b6356dfb70759bd19c19ca3a7bf68a8 Mon Sep 17 00:00:00 2001
From: iskode <ismael.from.kone@gmail.com>
Date: Thu, 19 Aug 2021 11:30:10 +0000
Subject: [PATCH 01/60] dataframe protocol implementation, support only int,
 float, categorical  without missing values

---
 python/cudf/cudf/core/dataframe.py         |   5 +
 python/cudf/cudf/core/df_protocol.py       | 672 +++++++++++++++++++++
 python/cudf/cudf/tests/test_df_protocol.py |  65 ++
 3 files changed, 742 insertions(+)
 create mode 100644 python/cudf/cudf/core/df_protocol.py
 create mode 100644 python/cudf/cudf/tests/test_df_protocol.py

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 0aafae0a85b..4388ad20c53 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -7683,3 +7683,8 @@ def _drop_columns(df: DataFrame, columns: Iterable, errors: str):
                 pass
             else:
                 raise e
+
+from cudf.core.df_protocol import __dataframe__, from_dataframe
+
+DataFrame.__dataframe__ = __dataframe__
+DataFrame.from_dataframe = from_dataframe
diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py
new file mode 100644
index 00000000000..d252c881be0
--- /dev/null
+++ b/python/cudf/cudf/core/df_protocol.py
@@ -0,0 +1,672 @@
+"""
+Implementation of the dataframe exchange protocol.
+
+Public API
+----------
+
+from_dataframe : construct a cudf.DataFrame from an input data frame which
+                 implements the exchange protocol
+
+Notes
+-----
+
+- Interpreting a raw pointer (as in ``Buffer.ptr``) is annoying and unsafe to
+  do in pure Python. It's more general but definitely less friendly than having
+  ``to_arrow`` and ``to_numpy`` methods. So for the buffers which lack
+  ``__dlpack__`` (e.g., because the column dtype isn't supported by DLPack),
+  this is worth looking at again.
+
+"""
+
+import enum
+import collections
+import ctypes
+from typing import Any, Optional, Tuple, Dict, Iterable, Sequence
+
+import cudf
+import numpy as np
+import cupy as cp
+import pandas._testing as tm
+import cudf.testing as testcase
+import pytest
+
+
+# A typing protocol could be added later to let Mypy validate code using
+# `from_dataframe` better.
+DataFrameObject = Any
+ColumnObject = Any
+
+
+def from_dataframe(df : DataFrameObject, copy: bool = False) :
+    """
+    Construct a cudf DataFrame from ``df`` if it supports ``__dataframe__``
+    """
+    if isinstance(df, cudf.DataFrame):
+        return df
+
+    if not hasattr(df, '__dataframe__'):
+        raise ValueError("`df` does not support __dataframe__")
+
+    return _from_dataframe(df.__dataframe__(), copy=copy)
+
+
+def _from_dataframe(df : DataFrameObject, copy: bool = False) :
+    """
+    Build a cudf.DataFrame from an interchange-protocol dataframe object.
+
+    Only single-chunk inputs with int, uint, float, bool or categorical
+    columns are handled; anything else raises NotImplementedError.
+    ``copy`` is forwarded to the column converters, which need
+    ``copy=True`` before moving data that is not already on the GPU.
+    """
+    # Iterating over several chunks is not supported yet.
+    if df.num_chunks() > 1:
+        raise NotImplementedError("Multi-chunk dataframes not handled yet")
+
+    columns = {}
+    _k = _DtypeKind
+    for name in df.column_names():
+        col = df.get_column_by_name(name)
+        kind = col.dtype[0]
+        if kind in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL):
+            columns[name] = convert_column_to_cupy_ndarray(col, copy=copy)
+        elif kind == _k.CATEGORICAL:
+            columns[name] = convert_categorical_column(col, copy=copy)
+        else:
+            raise NotImplementedError(f"Data type {kind} not handled yet")
+
+    return cudf.DataFrame(columns)
+
+
+
+class _DtypeKind(enum.IntEnum):
+    INT = 0
+    UINT = 1
+    FLOAT = 2
+    BOOL = 20
+    STRING = 21   # UTF-8
+    DATETIME = 22
+    CATEGORICAL = 23
+
+
+def convert_column_to_cupy_ndarray(col : ColumnObject, copy : bool = False) -> cp.ndarray:
+    """
+    Convert an int, uint, float or bool column to a cupy array.
+    """
+    if col.offset != 0:
+        raise NotImplementedError("column.offset > 0 not handled yet")
+
+    # Null kinds 0 (non-nullable) and 1 (NaN/NaT) need no extra handling.
+    if col.describe_null[0] not in (0, 1):
+        raise NotImplementedError("Null values represented as masks or "
+                                  "sentinel values not handled yet")
+    _buffer, _dtype = col.get_data_buffer()
+    return buffer_to_cupy_ndarray(_buffer, _dtype, copy=copy)
+
+def buffer_to_cupy_ndarray(_buffer, _dtype, copy : bool = False) -> cp.ndarray:
+    """
+    Return the buffer's data as a cupy array, via DLPack when already on CUDA.
+
+    Raises TypeError if a host-to-device copy is needed but ``copy`` is falsy.
+    """
+    if _buffer.__dlpack_device__()[0] == 2:  # DLPack device type 2 == CUDA
+        return cp.fromDlpack(_buffer.__dlpack__())
+    if not copy:
+        raise TypeError("This operation must copy data from CPU to GPU. Set `copy=True` to allow it.")
+    return _copy_buffer_to_gpu(_buffer, _dtype)
+
+
+def _copy_buffer_to_gpu(_buffer, _dtype):
+    # Handle the dtype
+    kind = _dtype[0]
+    bitwidth = _dtype[1]
+    _k = _DtypeKind
+    if _dtype[0] not in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL):
+        raise RuntimeError("Not a boolean, integer or floating-point dtype")
+
+    _ints = {8: np.int8, 16: np.int16, 32: np.int32, 64: np.int64}
+    _uints = {8: np.uint8, 16: np.uint16, 32: np.uint32, 64: np.uint64}
+    _floats = {32: np.float32, 64: np.float64}
+    _np_dtypes = {0: _ints, 1: _uints, 2: _floats, 20: {8: bool}}
+    column_dtype = _np_dtypes[kind][bitwidth]
+
+    # No DLPack yet, so need to construct a new ndarray from the data pointer
+    # and size in the buffer plus the dtype on the column
+    ctypes_type = np.ctypeslib.as_ctypes_type(column_dtype)
+    data_pointer = ctypes.cast(_buffer.ptr, ctypes.POINTER(ctypes_type))
+
+    # NOTE: `x` does not own its memory, so the caller of this function must
+    #       either make a copy or hold on to a reference of the column or
+    #       buffer! (not done yet, this is pretty awful ...)
+    x = np.ctypeslib.as_array(data_pointer,
+                              shape=(_buffer.bufsize // (bitwidth//8),))
+    return cp.array(x, dtype=column_dtype)
+
+
+def convert_categorical_column(col : ColumnObject, copy:bool=False) :
+    """
+    Convert a categorical column to a Series instance
+    """
+    
+
+    ordered, is_dict, mapping = col.describe_categorical
+    if not is_dict:
+        raise NotImplementedError('Non-dictionary categoricals not supported yet')
+
+    # If you want to cheat for testing (can't use `_col` in real-world code):
+    #    categories = col._col.values.categories.values
+    #    codes = col._col.values.codes
+    categories = cp.asarray(list(mapping.values()))
+    codes_buffer, codes_dtype = col.get_data_buffer()
+    codes = buffer_to_cupy_ndarray(codes_buffer, codes_dtype, copy=copy)
+    values = categories[codes]
+
+    # Seems like Pandas can only construct with non-null values, so need to
+    # null out the nulls later
+    cat = cudf.CategoricalIndex(values, categories=categories, ordered=ordered)
+    series = cudf.Series(cat)
+    null_kind = col.describe_null[0]
+    if null_kind == 2:  # sentinel value
+        sentinel = col.describe_null[1]
+        series[codes == sentinel] = None
+    else:
+        raise NotImplementedError("Only categorical columns with sentinel "
+                                  "value supported at the moment")
+
+    return series
+
+
+def __dataframe__(self, nan_as_null : bool = False) -> '_CuDFDataFrame':
+    """
+    Return the interchange-protocol wrapper for this cudf.DataFrame.
+
+    This function backs ``cudf.DataFrame.__dataframe__``.
+
+    Parameters
+    ----------
+    nan_as_null : bool, default False
+        Intended for the consumer to tell the producer to overwrite null
+        values in the data with ``NaN`` (or ``NaT``). It currently has no
+        effect; once support for nullable extension dtypes is added, this
+        value should be propagated to columns.
+
+    Returns
+    -------
+    _CuDFDataFrame
+        Wrapper around ``self`` exposing the protocol methods.
+
+    Notes
+    -----
+    Selecting a target device (``cpu``/``gpu``) is not supported yet.
+    """
+    return _CuDFDataFrame(self, nan_as_null=nan_as_null)
+
+
+# Monkeypatch the Pandas DataFrame class to support the interchange protocol
+# cudf.DataFrame.__dataframe__ = __dataframe__
+
+
+# Implementation of interchange protocol
+# --------------------------------------
+
+class _CuDFBuffer:
+
+    """
+    Data in the buffer is guaranteed to be contiguous in memory.
+    Note that there is no dtype attribute present, a buffer can be thought of
+    as simply a block of memory. However, if the column that the buffer is
+    attached to has a dtype that's supported by DLPack and ``__dlpack__`` is
+    implemented, then that dtype information will be contained in the return
+    value from ``__dlpack__``.
+    This distinction is useful to support both data exchange via DLPack on a
+    buffer and (b) dtypes like variable-length strings which do not have a
+    fixed number of bytes per element.
+    
+
+    Data in the buffer is guaranteed to be contiguous in memory.
+    """
+
+    def __init__(self, x : cp.ndarray) -> None:
+        """
+        Handle only regular columns (= cupy arrays) for now.
+        """
+        if not x.strides == (x.dtype.itemsize,):
+            # Array is not contiguous - this is possible to get in Pandas,
+            # there was some discussion on whether to support it. Some extra
+            # complexity for libraries that don't support it (e.g. Arrow),
+            # but would help with cupy-based libraries like CuDF.
+            raise RuntimeError("Design needs fixing - non-contiguous buffer")
+
+        # Store the numpy array in which the data resides as a private
+        # attribute, so we can use it to retrieve the public attributes
+        self._x = x
+
+    @property
+    def bufsize(self) -> int:
+        """
+        Size in bytes of the array's own data (element count * item size).
+        """
+        # Not ``data.mem.size``: that is the whole backing allocation.
+        return self._x.size * self._x.dtype.itemsize
+
+    @property
+    def ptr(self) -> int:
+        """
+        Pointer to start of the buffer as an integer
+        """
+        # return self._x.data.mem.ptr
+        return self._x.__cuda_array_interface__['data'][0]
+
+    def __dlpack__(self):
+
+        """
+        Produce DLPack capsule (see array API standard).
+        Raises:
+            - TypeError : if the buffer contains unsupported dtypes.
+            - NotImplementedError : if DLPack support is not implemented
+        Useful to have to connect to array libraries. Support optional because
+        it's not completely trivial to implement for a Python-only library.
+        
+
+        DLPack implemented in CuPy
+        """
+        try: 
+            res = self._x.toDlpack()
+        except ValueError:
+            raise TypeError(f'dtype {self._x.dtype} unsupported by `dlpack`')
+
+        return res
+
+    def __dlpack_device__(self) -> Tuple[enum.IntEnum, int]:
+
+        """
+        Device type and device ID for where the data in the buffer resides.
+        Uses device type codes matching DLPack. Enum members are::
+            - CPU = 1
+            - CUDA = 2
+            - CPU_PINNED = 3
+            - OPENCL = 4
+            - VULKAN = 7
+            - METAL = 8
+            - VPI = 9
+            - ROCM = 10
+        Note: must be implemented even if ``__dlpack__`` is not.
+        
+
+        Device type and device ID for where the data in the buffer resides.
+        """
+        class Device(enum.IntEnum):
+            CUDA = 2
+
+        return (Device.CUDA, self._x.device.id)
+
+    def __repr__(self) -> str:
+        # Deliberately omit ``__dlpack__()``: exporting a capsule just to print
+        # it is wasteful and raises TypeError for dtypes DLPack cannot express.
+        info = {'bufsize': self.bufsize, 'ptr': self.ptr,
+                'device': self.__dlpack_device__()[0].name}
+        return 'CuDFBuffer(' + str(info) + ')'
+
+class _CuDFColumn:
+    """
+    A column object, with only the methods and properties required by the
+    interchange protocol defined.
+
+    A column can contain one or more chunks. Each chunk can contain either one
+    or two buffers - one data buffer and (depending on null representation) it
+    may have a mask buffer.
+
+     TBD: Arrow has a separate "null" dtype, and has no separate mask concept.
+         Instead, it seems to use "children" for both columns with a bit mask,
+         and for nested dtypes. Unclear whether this is elegant or confusing.
+         This design requires checking the null representation explicitly.
+         The Arrow design requires checking:
+         1. the ARROW_FLAG_NULLABLE (for sentinel values)
+         2. if a column has two children, combined with one of those children
+            having a null dtype.
+         Making the mask concept explicit seems useful. One null dtype would
+         not be enough to cover both bit and byte masks, so that would mean
+         even more checking if we did it the Arrow way.
+    TBD: there's also the "chunk" concept here, which is implicit in Arrow as
+         multiple buffers per array (= column here). Semantically it may make
+         sense to have both: chunks were meant for example for lazy evaluation
+         of data which doesn't fit in memory, while multiple buffers per column
+         could also come from doing a selection operation on a single
+         contiguous buffer.
+         Given these concepts, one would expect chunks to be all of the same
+         size (say a 10,000 row dataframe could have 10 chunks of 1,000 rows),
+         while multiple buffers could have data-dependent lengths. Not an issue
+         in pandas if one column is backed by a single NumPy array, but in
+         Arrow it seems possible.
+         Are multiple chunks *and* multiple buffers per column necessary for
+         the purposes of this interchange protocol, or must producers either
+         reuse the chunk concept for this or copy the data?
+
+
+    Note: this Column object can only be produced by ``__dataframe__``, so
+          doesn't need its own version or ``__column__`` protocol.
+
+    """
+
+    def __init__(self, column) -> None:
+        """
+        Note: doesn't deal with extension arrays yet, just assume a regular
+        Series/ndarray for now.
+        """
+        if not isinstance(column, cudf.Series):
+            raise NotImplementedError(f"Columns of type {type(column)} not handled yet")
+
+        # Store the column as a private attribute
+        self._col = column
+
+    @property
+    def size(self) -> int:
+        """
+        Size of the column, in elements.
+
+        Corresponds to DataFrame.num_rows() if column is a single chunk;
+        equal to size of this current chunk otherwise.
+        """
+        return self._col.size
+
+    @property
+    def offset(self) -> int:
+        """
+        Offset of first element. Always zero.
+        TODO: check `Always zero (in case of cudf)?`
+
+        May be > 0 if using chunks; for example for a column with N chunks of
+        equal size M (only the last chunk may be shorter),
+        ``offset = n * M``, ``n = 0 .. N-1``.
+        """
+        return 0
+
+    @property
+    def dtype(self) -> Tuple[enum.IntEnum, int, str, str]:
+        """
+        Dtype description as a tuple ``(kind, bit-width, format string, endianness)``
+
+        Kind :
+
+            - INT = 0
+            - UINT = 1
+            - FLOAT = 2
+            - BOOL = 20
+            - STRING = 21   # UTF-8
+            - DATETIME = 22
+            - CATEGORICAL = 23
+
+        Bit-width : the number of bits as an integer
+        Format string : data type description format string in Apache Arrow C
+                        Data Interface format.
+        Endianness : current only native endianness (``=``) is supported
+
+        Notes:
+
+            - Kind specifiers are aligned with DLPack where possible (hence the
+              jump to 20, leave enough room for future extension)
+            - Masks must be specified as boolean with either bit width 1 (for bit
+              masks) or 8 (for byte masks).
+            - Dtype width in bits was preferred over bytes
+            - Endianness isn't too useful, but included now in case in the future
+              we need to support non-native endianness
+            - Went with Apache Arrow format strings over NumPy format strings
+              because they're more complete from a dataframe perspective
+            - Format strings are mostly useful for datetime specification, and
+              for categoricals.
+            - For categoricals, the format string describes the type of the
+              categorical in the data buffer. In case of a separate encoding of
+              the categorical (e.g. an integer to string mapping), this can
+              be derived from ``self.describe_categorical``.
+            - Data types not included: complex, Arrow-style null, binary, decimal,
+              and nested (list, struct, map, union) dtypes.
+        """
+        dtype = self._col.dtype
+        return self._dtype_from_cudfdtype(dtype)
+
+    def _dtype_from_cudfdtype(self, dtype) -> Tuple[enum.IntEnum, int, str, str]:
+        """
+        Map ``dtype`` to the tuple described in `self.dtype`.
+        """
+        # Note: 'c' (complex) not handled yet (not in array spec v1).
+        #       'b', 'B' (bytes), 'S', 'a', (old-style string) 'V' (void) not handled
+        #       datetime and timedelta both map to datetime (is timedelta handled?)
+        _k = _DtypeKind
+        _np_kinds = {'i': _k.INT, 'u': _k.UINT, 'f': _k.FLOAT, 'b': _k.BOOL,
+                     'U': _k.STRING,
+                     'M': _k.DATETIME, 'm': _k.DATETIME}
+        kind = _np_kinds.get(dtype.kind, None)
+        if kind is None:
+            # Not a NumPy dtype kind; the only other case handled is categorical.
+            if isinstance(dtype, cudf.CategoricalDtype):
+                kind = _k.CATEGORICAL
+                # Codes and categorical values dtypes are different.
+                # We use codes' dtype as these are stored in the buffer.
+                dtype = self._col.cat.codes.dtype
+            else:
+                raise ValueError(f"Data type {dtype} not supported by exchange "
+                                 "protocol")
+
+        # STRING and DATETIME kinds are recognised above but not exported yet.
+        if kind not in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL, _k.CATEGORICAL):
+            raise NotImplementedError(f"Data type {dtype} not handled yet")
+
+        bitwidth = dtype.itemsize * 8
+        format_str = dtype.str
+        endianness = dtype.byteorder if not kind == _k.CATEGORICAL else '='
+        return (kind, bitwidth, format_str, endianness)
+
+
+    @property
+    def describe_categorical(self) -> Tuple[bool, bool, Dict[int, Any]]:
+        """
+        If the dtype is categorical, there are two options:
+
+        - There are only values in the data buffer.
+        - There is a separate dictionary-style encoding for categorical values.
+
+        Raises TypeError if the dtype is not categorical
+
+        Content of returned tuple, in order:
+
+            - is_ordered : bool, whether the ordering of dictionary indices is
+                           semantically meaningful.
+            - is_dictionary : bool, whether a dictionary-style mapping of
+                              categorical values to other objects exists.
+                              Always True here.
+            - mapping : dict, Python-level only (e.g. ``{int: str}``), from
+                        category position to category value.
+
+        TBD: are there any other in-memory representations that are needed?
+        """
+        if not self.dtype[0] == _DtypeKind.CATEGORICAL:
+            raise TypeError("`describe_categorical` only works on a column "
+                            "with categorical dtype!")
+
+        ordered = self._col.dtype.ordered
+        is_dictionary = True
+        # NOTE: this shows the children approach is better, transforming
+        # `categories` to a "mapping" dict is inefficient
+        # Enumerate the categories' ``values_host`` to get position -> value
+        # pairs for the Python-level mapping.
+        categories = self._col.cat.categories
+        mapping = dict(enumerate(categories.values_host))
+        return ordered, is_dictionary, mapping
+
+    @property
+    def describe_null(self) -> Tuple[int, Any]:
+        """
+        Return the missing value (or "null") representation the column dtype
+        uses, as a tuple ``(kind, value)``.
+
+        Kind:
+
+            - 0 : non-nullable
+            - 1 : NaN/NaT
+            - 2 : sentinel value
+            - 3 : bit mask
+            - 4 : byte mask
+
+        Value : if kind is "sentinel value", the actual value. None otherwise.
+        """
+        _k = _DtypeKind
+        kind = self.dtype[0]
+        value = None
+        if kind == _k.FLOAT:
+            null = 1  # np.nan
+        elif kind == _k.DATETIME:
+            null = 1  # np.datetime64('NaT')
+        elif kind in (_k.INT, _k.UINT, _k.BOOL):
+            # TODO: check if extension dtypes are used once support for them is
+            #       implemented in this protocol code
+            null = 0  # integer and boolean dtypes are non-nullable
+        elif kind == _k.CATEGORICAL:
+            # Null values for categoricals are stored as `-1` sentinel values
+            # in the category data (e.g., `col.cat.codes` is uint8 np.ndarray at least)
+            null = 2
+            value = -1
+        else:
+            raise NotImplementedError(f'Data type {self.dtype} not yet supported')
+
+        return null, value
+
+    @property
+    def null_count(self) -> int:
+        """
+        Number of null elements. Should always be known.
+
+        Note: Arrow uses -1 to indicate "unknown", but None seems cleaner.
+        """
+        return self._col.isna().sum()
+
+    def num_chunks(self) -> int:
+        """
+        Return the number of chunks the column consists of.
+
+        TBC: Seems like chunks are used for parallel computation purpose in cudf:`apply_chunks`.
+        """
+        return 1
+
+    def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable['_CuDFColumn']:
+        """
+        Return an iterator yielding the chunks.
+
+        See `DataFrame.get_chunks` for details on ``n_chunks``.
+        """
+        return (self,)
+
+    def get_data_buffer(self) -> Tuple[_CuDFBuffer, Any]:  # Any is for self.dtype tuple
+        """
+        Return the data buffer (the codes, for categoricals) and its dtype tuple.
+        """
+        _k = _DtypeKind
+        if self.dtype[0] in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL):
+            buffer = _CuDFBuffer(cp.array(self._col.to_gpu_array(), copy=False))
+            dtype = self.dtype
+        elif self.dtype[0] == _k.CATEGORICAL:
+            _, value = self.describe_null
+            codes = self._col.cat.codes
+            # NOTE(review): nulls are filled with 100, not the sentinel `value` (-1) from describe_null -- confirm
+            buffer = _CuDFBuffer(cp.array(codes.fillna(100), copy=False))
+            dtype = self._dtype_from_cudfdtype(codes.dtype)
+        else:
+            raise NotImplementedError(f"Data type {self._col.dtype} not handled yet")
+
+        return buffer, dtype
+
+    def get_mask(self) -> _CuDFBuffer:
+        """
+        Return the buffer containing the mask values indicating missing data.
+
+        Raises RuntimeError if null representation is not a bit or byte mask.
+        """
+        null, value = self.describe_null
+        if null == 0:
+            msg = "This column is non-nullable so does not have a mask"
+        elif null == 1:
+            msg = "This column uses NaN as null so does not have a separate mask"
+        else:
+            raise NotImplementedError('See self.describe_null')
+
+        raise RuntimeError(msg)
+
+    # def get_children(self) -> Iterable[Column]:
+    #     """
+    #     Children columns underneath the column, each object in this iterator
+    #     must adhere to the column specification
+    #     """
+    #     pass
+
+class _CuDFDataFrame:
+    """
+    A data frame class, with only the methods required by the interchange
+    protocol defined.
+
+    Instances of this (private) class are returned from
+    ``cudf.DataFrame.__dataframe__`` as objects with the methods and
+    attributes defined on this class.
+    """
+    def __init__(self, df, nan_as_null : bool = False) -> None:
+        """
+        , device:str = 'gpu'
+        Constructor - an instance of this (private) class is returned from
+        `cudf.DataFrame.__dataframe__`.
+        """
+        # ``nan_as_null`` is a keyword intended for the consumer to tell the
+        # producer to overwrite null values in the data with ``NaN`` (or ``NaT``).
+        # This currently has no effect; once support for nullable extension
+        # dtypes is added, this value should be propagated to columns.
+        #
+        # ``device`` indicates the target device for the data.
+        self._nan_as_null = nan_as_null
+        self._df = df
+
+    def num_columns(self) -> int:
+        return len(self._df.columns)
+
+    def num_rows(self) -> int:
+        return len(self._df)
+
+    def num_chunks(self) -> int:
+        return 1
+
+    def column_names(self) -> Iterable[str]:
+        return self._df.columns.tolist()
+
+    def get_column(self, i: int) -> _CuDFColumn:
+        return _CuDFColumn(self._df.iloc[:, i])
+
+    def get_column_by_name(self, name: str) -> _CuDFColumn:
+        return _CuDFColumn(self._df[name])
+
+    def get_columns(self) -> Iterable[_CuDFColumn]:
+        return [_CuDFColumn(self._df[name]) for name in self._df.columns]
+
+    def select_columns(self, indices: Sequence[int]) -> '_CuDFDataFrame':
+        """Return a new protocol dataframe holding the columns at ``indices``."""
+        if not isinstance(indices, collections.abc.Sequence):  # .Sequence: gone in 3.10
+            raise ValueError("`indices` is not a sequence")
+        return _CuDFDataFrame(self._df.iloc[:, indices], self._nan_as_null)
+    
+    def select_columns_by_name(self, names: Sequence[str]) -> '_CuDFDataFrame':
+        """
+        Create a new DataFrame by selecting a subset of columns by name.
+
+        ``names`` must be a sequence of column labels; a ValueError is raised
+        otherwise. The selection is done with ``.loc[:, names]`` and the
+        result keeps this object's ``nan_as_null`` setting.
+
+        Note: ``xs`` is deliberately not used, since it selects a
+        cross-section at a particular level of a MultiIndex.
+        """
+        # ``collections.Sequence`` was removed in Python 3.10; use the ABC module.
+        if not isinstance(names, collections.abc.Sequence):
+            raise ValueError("`names` is not a sequence")
+
+        return _CuDFDataFrame(self._df.loc[:, names], self._nan_as_null)
+
+    def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable['_CuDFDataFrame']:
+        """
+        Return an iterator yielding the chunks.
+        """
+        return (self,)
diff --git a/python/cudf/cudf/tests/test_df_protocol.py b/python/cudf/cudf/tests/test_df_protocol.py
new file mode 100644
index 00000000000..ff720ac807c
--- /dev/null
+++ b/python/cudf/cudf/tests/test_df_protocol.py
@@ -0,0 +1,65 @@
+import datetime
+import cupy
+import numpy as np
+import pytest
+from cudf.core import df_protocol
+
+import cudf
+from cudf.testing import _utils as utils
+from cudf.testing._utils import (
+    ALL_TYPES,
+    DATETIME_TYPES,
+    NUMERIC_TYPES,
+    assert_eq,
+    assert_exceptions_equal,
+    does_not_raise,
+    gen_rand,
+)
+
+
+def _from_dataframe_equals(df, copy=False):
+    df2 = df_protocol._from_dataframe(df.__dataframe__(), copy=copy)
+    assert_eq(df, df2)
+
+def _from_dataframe_exception(df):
+    exception_msg = "This operation must copy data from CPU to GPU. Set `copy=True` to allow it."
+    with pytest.raises(TypeError, match=exception_msg):
+        df2 = from_dataframe(df, copy=False)
+
+def _datatype(data):
+    cdf = cudf.DataFrame(data=data)
+    _from_dataframe_equals(cdf, copy=False)
+    _from_dataframe_equals(cdf, copy=True)
+
+    
+def test_int_dtype():
+    data_int = dict(a=[1, 2, 3], b=[9, 10, 11])
+    _datatype(data_int)
+
+def test_float_dtype():
+    data_float = dict(a=[1.5, 2.5, 3.5], b=[9.2, 10.5, 11.8])
+    _datatype(data_float)
+
+def test_mixed_intfloat_dtype():
+    data_intfloat = dict(a=[1, 2, 3], b=[1.5, 2.5, 3.5])
+    _datatype(data_intfloat)
+
+def test_categorical_dtype():
+
+    def test__dataframe__(df):
+        # Some detailed testing for correctness of dtype:
+        col = df.__dataframe__().get_column_by_name('A')
+        assert col.dtype[0] == df_protocol._DtypeKind.CATEGORICAL
+        assert col.null_count == 0
+        assert col.num_chunks() == 1
+        assert col.describe_categorical == (False, True, {0: 1, 1: 2, 2: 5})
+
+    cdf = cudf.DataFrame({"A": [1, 2, 5, 1]})
+    cdf["A"] = cdf["A"].astype("category")
+    test__dataframe__(cdf)
+    _from_dataframe_equals(cdf, copy=False)
+    _from_dataframe_equals(cdf, copy=True)
+
+# def test_bool_dtype():
+#     data_bool = dict(a=[True, True, False], b=[False, True, False])
+#     _datatype(data_bool)
\ No newline at end of file

From 4367d8f5e6c2b27eeed19111fb3d3a3f1e8713f2 Mon Sep 17 00:00:00 2001
From: iskode <ismael.from.kone@gmail.com>
Date: Thu, 19 Aug 2021 12:52:07 +0000
Subject: [PATCH 02/60] refactor to call from_dataframe on cudf directly  and
 __dataframe__() on the dataframe object

---
 python/cudf/cudf/__init__.py       |  1 +
 python/cudf/cudf/core/__init__.py  |  2 +-
 python/cudf/cudf/core/dataframe.py | 13 ++++++++++---
 3 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py
index 13c20d8bcd4..112fbe118ad 100644
--- a/python/cudf/cudf/__init__.py
+++ b/python/cudf/cudf/__init__.py
@@ -41,6 +41,7 @@
     UInt64Index,
     cut,
     from_pandas,
+    from_dataframe,
     interval_range,
     merge,
 )
diff --git a/python/cudf/cudf/core/__init__.py b/python/cudf/cudf/core/__init__.py
index 016aba2edb3..7e825d38b7f 100644
--- a/python/cudf/cudf/core/__init__.py
+++ b/python/cudf/cudf/core/__init__.py
@@ -2,7 +2,7 @@
 
 from cudf.core import _internals, buffer, column, column_accessor, common
 from cudf.core.buffer import Buffer
-from cudf.core.dataframe import DataFrame, from_pandas, merge
+from cudf.core.dataframe import DataFrame, from_pandas, merge, from_dataframe
 from cudf.core.index import (
     BaseIndex,
     CategoricalIndex,
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 4388ad20c53..2be9e37c35f 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -53,6 +53,8 @@
     numeric_normalize_types,
 )
 from cudf.utils.utils import GetAttrGetItemMixin
+from cudf.core import df_protocol
+
 
 T = TypeVar("T", bound="DataFrame")
 
@@ -7410,6 +7412,11 @@ def explode(self, column, ignore_index=False):
 
         return super()._explode(column, ignore_index)
 
+    def __dataframe__(self, nan_as_null : bool = False):
+        return df_protocol.__dataframe__(self, nan_as_null=nan_as_null)
+    
+def from_dataframe(df, copy = False):
+    return df_protocol.from_dataframe(df, copy=copy)
 
 def from_pandas(obj, nan_as_null=None):
     """
@@ -7684,7 +7691,7 @@ def _drop_columns(df: DataFrame, columns: Iterable, errors: str):
             else:
                 raise e
 
-from cudf.core.df_protocol import __dataframe__, from_dataframe
+# from cudf.core.df_protocol import __dataframe__, from_dataframe
 
-DataFrame.__dataframe__ = __dataframe__
-DataFrame.from_dataframe = from_dataframe
+# DataFrame.__dataframe__ = __dataframe__
+# DataFrame.from_dataframe = from_dataframe

From 331c69618f2c4391d5400904dc202c2bf7c14776 Mon Sep 17 00:00:00 2001
From: iskode <ismael.from.kone@gmail.com>
Date: Thu, 19 Aug 2021 14:41:14 +0000
Subject: [PATCH 03/60] remove commented monkeypatch

---
 python/cudf/cudf/core/dataframe.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 2be9e37c35f..38b2141b987 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -7690,8 +7690,3 @@ def _drop_columns(df: DataFrame, columns: Iterable, errors: str):
                 pass
             else:
                 raise e
-
-# from cudf.core.df_protocol import __dataframe__, from_dataframe
-
-# DataFrame.__dataframe__ = __dataframe__
-# DataFrame.from_dataframe = from_dataframe

From c83b4d7669e9d980af7b70fa8d74930e02d64d97 Mon Sep 17 00:00:00 2001
From: iskode <ismael.from.kone@gmail.com>
Date: Thu, 19 Aug 2021 14:41:59 +0000
Subject: [PATCH 04/60] refactor test cases

---
 python/cudf/cudf/tests/test_df_protocol.py | 53 ++++++++++++++++------
 1 file changed, 38 insertions(+), 15 deletions(-)

diff --git a/python/cudf/cudf/tests/test_df_protocol.py b/python/cudf/cudf/tests/test_df_protocol.py
index ff720ac807c..d27cb6d4a5d 100644
--- a/python/cudf/cudf/tests/test_df_protocol.py
+++ b/python/cudf/cudf/tests/test_df_protocol.py
@@ -2,7 +2,12 @@
 import cupy
 import numpy as np
 import pytest
-from cudf.core import df_protocol
+from cudf.core.df_protocol import (
+    _from_dataframe, 
+    _DtypeKind,
+    __dataframe__,
+    _CuDFDataFrame
+)
 
 import cudf
 from cudf.testing import _utils as utils
@@ -15,41 +20,59 @@
     does_not_raise,
     gen_rand,
 )
+import pandas as pd
+
+
+def _test_from_dataframe_equals(dfobj, copy=False):
+    df2 = _from_dataframe(dfobj, copy=copy)
+
+    if isinstance(dfobj._df, cudf.DataFrame):
+        assert_eq(dfobj._df, df2)
 
+    elif isinstance(dfobj._df, pd.DataFrame):
+        assert_eq(cudf.DataFrame(dfobj._df), df2)
 
-def _from_dataframe_equals(df, copy=False):
-    df2 = df_protocol._from_dataframe(df.__dataframe__(), copy=copy)
-    assert_eq(df, df2)
+    else:
+        raise TypeError(f"{type(dfobj._df)} not supported yet.")
 
-def _from_dataframe_exception(df):
+
+def _test_from_dataframe_exception(dfobj):
     exception_msg = "This operation must copy data from CPU to GPU. Set `copy=True` to allow it."
     with pytest.raises(TypeError, match=exception_msg):
-        df2 = from_dataframe(df, copy=False)
+        df2 = _from_dataframe(dfobj, copy=False)
 
-def _datatype(data):
+def _test_datatype(data):
     cdf = cudf.DataFrame(data=data)
-    _from_dataframe_equals(cdf, copy=False)
-    _from_dataframe_equals(cdf, copy=True)
+    cdfobj = cdf.__dataframe__()
+    print(cdfobj)
+    _test_from_dataframe_equals(cdfobj, copy=False)
+    _test_from_dataframe_equals(cdfobj, copy=True)
+
+    # pdf = pd.DataFrame(data=data)
+    # cpu_dfobj = _CuDFDataFrame(pdf)
+    # _test_from_dataframe_exception(cpu_dfobj)
+    # _test_from_dataframe_equals(cpu_dfobj, copy=True)
+    
 
     
 def test_int_dtype():
     data_int = dict(a=[1, 2, 3], b=[9, 10, 11])
-    _datatype(data_int)
+    _test_datatype(data_int)
 
 def test_float_dtype():
     data_float = dict(a=[1.5, 2.5, 3.5], b=[9.2, 10.5, 11.8])
-    _datatype(data_float)
+    _test_datatype(data_float)
 
 def test_mixed_intfloat_dtype():
     data_intfloat = dict(a=[1, 2, 3], b=[1.5, 2.5, 3.5])
-    _datatype(data_intfloat)
+    _test_datatype(data_intfloat)
 
 def test_categorical_dtype():
 
     def test__dataframe__(df):
         # Some detailed testing for correctness of dtype:
         col = df.__dataframe__().get_column_by_name('A')
-        assert col.dtype[0] == df_protocol._DtypeKind.CATEGORICAL
+        assert col.dtype[0] == _DtypeKind.CATEGORICAL
         assert col.null_count == 0
         assert col.num_chunks() == 1
         assert col.describe_categorical == (False, True, {0: 1, 1: 2, 2: 5})
@@ -57,8 +80,8 @@ def test__dataframe__(df):
     cdf = cudf.DataFrame({"A": [1, 2, 5, 1]})
     cdf["A"] = cdf["A"].astype("category")
     test__dataframe__(cdf)
-    _from_dataframe_equals(cdf, copy=False)
-    _from_dataframe_equals(cdf, copy=True)
+    _test_from_dataframe_equals(cdf.__dataframe__(), copy=False)
+    _test_from_dataframe_equals(cdf.__dataframe__(), copy=True)
 
 # def test_bool_dtype():
 #     data_bool = dict(a=[True, True, False], b=[False, True, False])

From defcbc57c48e4cd5891f34273ff584c7bd9debfe Mon Sep 17 00:00:00 2001
From: iskode <ismael.from.kone@gmail.com>
Date: Thu, 19 Aug 2021 22:40:08 +0000
Subject: [PATCH 05/60] propagate nan_as_null from DataFrame to Column class

---
 python/cudf/cudf/core/df_protocol.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py
index d252c881be0..eae7bf9e9f7 100644
--- a/python/cudf/cudf/core/df_protocol.py
+++ b/python/cudf/cudf/core/df_protocol.py
@@ -348,7 +348,7 @@ class _CuDFColumn:
 
     """
 
-    def __init__(self, column) -> None:
+    def __init__(self, column, nan_as_null=False) -> None:
         """
         Note: doesn't deal with extension arrays yet, just assume a regular
         Series/ndarray for now.
@@ -358,6 +358,7 @@ def __init__(self, column) -> None:
 
         # Store the column as a private attribute
         self._col = column
+        self._nan_as_null = nan_as_null
 
     @property
     def size(self) -> int:

From 7c197205060e092bc097d669ef856e98718c0f89 Mon Sep 17 00:00:00 2001
From: iskode <ismael.from.kone@gmail.com>
Date: Thu, 26 Aug 2021 11:21:13 +0000
Subject: [PATCH 06/60] start missing-value support + int missing-value tests

---
 python/cudf/cudf/core/df_protocol.py       | 69 ++++++++++++++--------
 python/cudf/cudf/tests/test_df_protocol.py |  9 +++
 2 files changed, 52 insertions(+), 26 deletions(-)

diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py
index eae7bf9e9f7..f4da1ab7efc 100644
--- a/python/cudf/cudf/core/df_protocol.py
+++ b/python/cudf/cudf/core/df_protocol.py
@@ -96,14 +96,15 @@ def convert_column_to_cupy_ndarray(col : ColumnObject, copy : bool = False) -> n
     if col.offset != 0:
         raise NotImplementedError("column.offset > 0 not handled yet")
 
-    if col.describe_null[0] not in (0, 1):
-        raise NotImplementedError("Null values represented as masks or "
-                                  "sentinel values not handled yet")
+    # if col.describe_null[0] not in (0, 1):
+    #     raise NotImplementedError("Null values represented as masks or "
+    #                               "sentinel values not handled yet")
 
     _buffer, _dtype = col.get_data_buffer()
-    return buffer_to_cupy_ndarray(_buffer, _dtype, copy=copy)
+    _mask_buffer = col.get_mask()
+    return buffer_to_cupy_ndarray(_buffer, _dtype, _mask_buffer, copy=copy)
 
-def buffer_to_cupy_ndarray(_buffer, _dtype, copy : bool = False) -> cp.ndarray:
+def buffer_to_cupy_ndarray(_buffer, _dtype, _mask_buffer = None, copy : bool = False) -> cp.ndarray:
     if _buffer.__dlpack_device__()[0] == 2: # dataframe is on GPU/CUDA
         x = cp.fromDlpack(_buffer.__dlpack__())
 
@@ -158,7 +159,8 @@ def convert_categorical_column(col : ColumnObject, copy:bool=False) :
     #    codes = col._col.values.codes
     categories = cp.asarray(list(mapping.values()))
     codes_buffer, codes_dtype = col.get_data_buffer()
-    codes = buffer_to_cupy_ndarray(codes_buffer, codes_dtype, copy=copy)
+    _mask_buffer = col.get_mask()
+    codes = buffer_to_cupy_ndarray(codes_buffer, codes_dtype, _mask_buffer, copy=copy)
     values = categories[codes]
 
     # Seems like Pandas can only construct with non-null values, so need to
@@ -513,19 +515,21 @@ def describe_null(self) -> Tuple[int, Any]:
         _k = _DtypeKind
         kind = self.dtype[0]
         value = None
-        if kind == _k.FLOAT:
-            null = 1  # np.nan
-        elif kind == _k.DATETIME:
-            null = 1  # np.datetime64('NaT')
-        elif kind in (_k.INT, _k.UINT, _k.BOOL):
-            # TODO: check if extension dtypes are used once support for them is
-            #       implemented in this procotol code
-            null = 0  # integer and boolean dtypes are non-nullable
-        elif kind == _k.CATEGORICAL:
-            # Null values for categoricals are stored as `-1` sentinel values
-            # in the category date (e.g., `col.cat.codes` is uint8 np.ndarray at least)
-            null = 2
-            value = -1
+        if kind in (_k.INT, _k.UINT, _k.FLOAT, _k.CATEGORICAL):
+            null = 3
+        # if kind == _k.FLOAT:
+        #     null = 1  # np.nan
+        # elif kind == _k.DATETIME:
+        #     null = 1  # np.datetime64('NaT')
+        # elif kind in (_k.INT, _k.UINT, _k.BOOL):
+        #     # TODO: check if extension dtypes are used once support for them is
+        #     #       implemented in this procotol code
+        #     null = 0  # integer and boolean dtypes are non-nullable
+        # elif kind == _k.CATEGORICAL:
+        #     # Null values for categoricals are stored as `-1` sentinel values
+        #     # in the category date (e.g., `col.values.codes` is int8 np.ndarray)
+        #     null = 2
+        #     value = -1
         else:
             raise NotImplementedError(f'Data type {self.dtype} not yet supported')
 
@@ -562,13 +566,13 @@ def get_data_buffer(self) -> Tuple[_CuDFBuffer, Any]:  # Any is for self.dtype t
         """
         _k = _DtypeKind
         if self.dtype[0] in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL):
-            buffer = _CuDFBuffer(cp.array(self._col.to_gpu_array(), copy=False))
+            buffer = _CuDFBuffer(cp.array(self._col.fillna(0).to_gpu_array(), copy=False))
             dtype = self.dtype
         elif self.dtype[0] == _k.CATEGORICAL:
             _, value = self.describe_null
             codes = self._col.cat.codes
             # handling null/NaN
-            buffer = _CuDFBuffer(cp.array(codes.fillna(100), copy=False))
+            buffer = _CuDFBuffer(cp.array(codes.fillna(0), copy=False))
             dtype = self._dtype_from_cudfdtype(codes.dtype)
         else:
             raise NotImplementedError(f"Data type {self._col.dtype} not handled yet")
@@ -582,14 +586,27 @@ def get_mask(self) -> _CuDFBuffer:
         Raises RuntimeError if null representation is not a bit or byte mask.
         """
         null, value = self.describe_null
+        buffer = None
         if null == 0:
             msg = "This column is non-nullable so does not have a mask"
         elif null == 1:
             msg = "This column uses NaN as null so does not have a separate mask"
+
+        elif null == 3:
+            
+            _k = _DtypeKind
+            if self.dtype[0] in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL):
+                buffer = _CuDFBuffer(cp.array(self._col.nullmask, copy=False))
+            elif self.dtype[0] == _k.CATEGORICAL:
+                codes = self._col.cat.codes.nullmask
+                # handling null/NaN
+                buffer = _CuDFBuffer(cp.array(codes, copy=False))
+
         else:
             raise NotImplementedError('See self.describe_null')
 
-        raise RuntimeError(msg)
+        return buffer
+
 
     # def get_children(self) -> Iterable[Column]:
     #     """
@@ -635,13 +652,13 @@ def column_names(self) -> Iterable[str]:
         return self._df.columns.tolist()
 
     def get_column(self, i: int) -> _CuDFColumn:
-        return _CuDFColumn(self._df.iloc[:, i])
+        return _CuDFColumn(self._df.iloc[:, i], self._nan_as_null)
 
     def get_column_by_name(self, name: str) -> _CuDFColumn:
-        return _CuDFColumn(self._df[name])
+        return _CuDFColumn(self._df[name], self._nan_as_null)
 
     def get_columns(self) -> Iterable[_CuDFColumn]:
-        return [_CuDFColumn(self._df[name]) for name in self._df.columns]
+        return [_CuDFColumn(self._df[name], self._nan_as_null) for name in self._df.columns]
 
     def select_columns(self, indices: Sequence[int]) -> '_CuDFDataFrame':
         if not isinstance(indices, collections.Sequence):
@@ -664,7 +681,7 @@ def xs(self, key, axis=0, level=None, drop_level: bool_t = True):
         if not isinstance(names, collections.Sequence):
             raise ValueError("`names` is not a sequence")
 
-        return _CuDFDataFrame(self._df.loc[:, names])
+        return _CuDFDataFrame(self._df.loc[:, names], self._nan_as_null)
 
     def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable['_CuDFDataFrame']:
         """
diff --git a/python/cudf/cudf/tests/test_df_protocol.py b/python/cudf/cudf/tests/test_df_protocol.py
index d27cb6d4a5d..99917b0a2a8 100644
--- a/python/cudf/cudf/tests/test_df_protocol.py
+++ b/python/cudf/cudf/tests/test_df_protocol.py
@@ -83,6 +83,15 @@ def test__dataframe__(df):
     _test_from_dataframe_equals(cdf.__dataframe__(), copy=False)
     _test_from_dataframe_equals(cdf.__dataframe__(), copy=True)
 
+def test_NA_int_dtype():
+    data_int = dict(a=[1, None, 3], b=[9, 10, None])
+    _test_datatype(data_int)
+
+# def test_NA2_int_dtype():
+#     data_int = dict(a=[1, None, 3, None, 5], b=[9, 10, None, 7, 8])
+#     _test_datatype(data_int)
+
+
 # def test_bool_dtype():
 #     data_bool = dict(a=[True, True, False], b=[False, True, False])
 #     _datatype(data_bool)
\ No newline at end of file

From 89d00f2c24bedb84ec422d0104f1644e1f749584 Mon Sep 17 00:00:00 2001
From: iskode <ismael.from.kone@gmail.com>
Date: Mon, 13 Sep 2021 11:38:04 +0000
Subject: [PATCH 07/60] apply protocol update changes

---
 python/cudf/cudf/core/df_protocol.py | 404 +++++++++++++++------------
 1 file changed, 219 insertions(+), 185 deletions(-)

diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py
index f4da1ab7efc..ba5291fb08d 100644
--- a/python/cudf/cudf/core/df_protocol.py
+++ b/python/cudf/cudf/core/df_protocol.py
@@ -100,8 +100,8 @@ def convert_column_to_cupy_ndarray(col : ColumnObject, copy : bool = False) -> n
     #     raise NotImplementedError("Null values represented as masks or "
     #                               "sentinel values not handled yet")
 
-    _buffer, _dtype = col.get_data_buffer()
-    _mask_buffer = col.get_mask()
+    _buffer, _dtype = col.get_buffers()['data']
+    _mask_buffer = col.get_buffers()['validity']
     return buffer_to_cupy_ndarray(_buffer, _dtype, _mask_buffer, copy=copy)
 
 def buffer_to_cupy_ndarray(_buffer, _dtype, _mask_buffer = None, copy : bool = False) -> cp.ndarray:
@@ -158,8 +158,8 @@ def convert_categorical_column(col : ColumnObject, copy:bool=False) :
     #    categories = col._col.values.categories.values
     #    codes = col._col.values.codes
     categories = cp.asarray(list(mapping.values()))
-    codes_buffer, codes_dtype = col.get_data_buffer()
-    _mask_buffer = col.get_mask()
+    codes_buffer, codes_dtype = col.get_buffers()['data']
+    _mask_buffer = col.get_buffers()['validity']
     codes = buffer_to_cupy_ndarray(codes_buffer, codes_dtype, _mask_buffer, copy=copy)
     values = categories[codes]
 
@@ -167,13 +167,18 @@ def convert_categorical_column(col : ColumnObject, copy:bool=False) :
     # null out the nulls later
     cat = cudf.CategoricalIndex(values, categories=categories, ordered=ordered)
     series = cudf.Series(cat)
+
     null_kind = col.describe_null[0]
-    if null_kind == 2:  # sentinel value
-        sentinel = col.describe_null[1]
-        series[codes == sentinel] = None
-    else:
-        raise NotImplementedError("Only categorical columns with sentinel "
-                                  "value supported at the moment")
+    if null_kind != 0:
+        print(null_kind)
+        if null_kind == 2:  # sentinel value
+            sentinel = col.describe_null[1]
+            series[codes == sentinel] = None
+        elif null_kind == 3:
+            pass
+        else:
+            raise NotImplementedError("Only categorical columns with sentinel "
+                                    "value supported at the moment")
 
     return series
 
@@ -203,7 +208,6 @@ def __dataframe__(self, nan_as_null : bool = False) -> dict:
 
     return _CuDFDataFrame(self, nan_as_null=nan_as_null)
 
-
 # Monkeypatch the Pandas DataFrame class to support the interchange protocol
 # cudf.DataFrame.__dataframe__ = __dataframe__
 
@@ -212,32 +216,22 @@ def __dataframe__(self, nan_as_null : bool = False) -> dict:
 # --------------------------------------
 
 class _CuDFBuffer:
-
     """
-    Data in the buffer is guaranteed to be contiguous in memory.
-    Note that there is no dtype attribute present, a buffer can be thought of
-    as simply a block of memory. However, if the column that the buffer is
-    attached to has a dtype that's supported by DLPack and ``__dlpack__`` is
-    implemented, then that dtype information will be contained in the return
-    value from ``__dlpack__``.
-    This distinction is useful to support both data exchange via DLPack on a
-    buffer and (b) dtypes like variable-length strings which do not have a
-    fixed number of bytes per element.
-    
-
     Data in the buffer is guaranteed to be contiguous in memory.
     """
 
-    def __init__(self, x : cp.ndarray) -> None:
+    def __init__(self, x : cp.ndarray, allow_copy : bool = True) -> None:
         """
-        Handle only regular columns (= cupy arrays) for now.
+        Handle only regular columns (= numpy arrays) for now.
         """
         if not x.strides == (x.dtype.itemsize,):
-            # Array is not contiguous - this is possible to get in Pandas,
-            # there was some discussion on whether to support it. Some extra
-            # complexity for libraries that don't support it (e.g. Arrow),
-            # but would help with cupy-based libraries like CuDF.
-            raise RuntimeError("Design needs fixing - non-contiguous buffer")
+            # The protocol does not support strided buffers, so a copy is
+            # necessary. If that's not allowed, we need to raise an exception.
+            if allow_copy:
+                x = x.copy()
+            else:
+                raise RuntimeError("Exports cannot be zero-copy in the case "
+                                   "of a non-contiguous buffer")
 
         # Store the numpy array in which the data resides as a private
         # attribute, so we can use it to retrieve the public attributes
@@ -246,7 +240,7 @@ def __init__(self, x : cp.ndarray) -> None:
     @property
     def bufsize(self) -> int:
         """
-        Buffer size in bytes
+        Buffer size in bytes.
         """
         return self._x.data.mem.size
         # return self._x.size * self._x.dtype.itemsize
@@ -254,23 +248,13 @@ def bufsize(self) -> int:
     @property
     def ptr(self) -> int:
         """
-        Pointer to start of the buffer as an integer
+        Pointer to start of the buffer as an integer.
         """
-        # return self._x.data.mem.ptr
         return self._x.__cuda_array_interface__['data'][0]
 
     def __dlpack__(self):
-
         """
-        Produce DLPack capsule (see array API standard).
-        Raises:
-            - TypeError : if the buffer contains unsupported dtypes.
-            - NotImplementedError : if DLPack support is not implemented
-        Useful to have to connect to array libraries. Support optional because
-        it's not completely trivial to implement for a Python-only library.
-        
-
-        DLPack implemented in CuPy
+        DLPack not implemented in NumPy yet, so leave it out here.
         """
         try: 
             res = self._x.toDlpack()
@@ -280,25 +264,11 @@ def __dlpack__(self):
         return res
 
     def __dlpack_device__(self) -> Tuple[enum.IntEnum, int]:
-
         """
-        Device type and device ID for where the data in the buffer resides.
-        Uses device type codes matching DLPack. Enum members are::
-            - CPU = 1
-            - CUDA = 2
-            - CPU_PINNED = 3
-            - OPENCL = 4
-            - VULKAN = 7
-            - METAL = 8
-            - VPI = 9
-            - ROCM = 10
-        Note: must be implemented even if ``__dlpack__`` is not.
-        
-
         Device type and device ID for where the data in the buffer resides.
         """
         class Device(enum.IntEnum):
-            CUDA = 2
+             CUDA = 2
 
         return (Device.CUDA, self._x.device.id)
 
@@ -314,61 +284,36 @@ class _CuDFColumn:
     A column object, with only the methods and properties required by the
     interchange protocol defined.
 
-    A column can contain one or more chunks. Each chunk can contain either one
-    or two buffers - one data buffer and (depending on null representation) it
-    may have a mask buffer.
-
-     TBD: Arrow has a separate "null" dtype, and has no separate mask concept.
-         Instead, it seems to use "children" for both columns with a bit mask,
-         and for nested dtypes. Unclear whether this is elegant or confusing.
-         This design requires checking the null representation explicitly.
-         The Arrow design requires checking:
-         1. the ARROW_FLAG_NULLABLE (for sentinel values)
-         2. if a column has two children, combined with one of those children
-            having a null dtype.
-         Making the mask concept explicit seems useful. One null dtype would
-         not be enough to cover both bit and byte masks, so that would mean
-         even more checking if we did it the Arrow way.
-    TBD: there's also the "chunk" concept here, which is implicit in Arrow as
-         multiple buffers per array (= column here). Semantically it may make
-         sense to have both: chunks were meant for example for lazy evaluation
-         of data which doesn't fit in memory, while multiple buffers per column
-         could also come from doing a selection operation on a single
-         contiguous buffer.
-         Given these concepts, one would expect chunks to be all of the same
-         size (say a 10,000 row dataframe could have 10 chunks of 1,000 rows),
-         while multiple buffers could have data-dependent lengths. Not an issue
-         in pandas if one column is backed by a single NumPy array, but in
-         Arrow it seems possible.
-         Are multiple chunks *and* multiple buffers per column necessary for
-         the purposes of this interchange protocol, or must producers either
-         reuse the chunk concept for this or copy the data?
-
+    A column can contain one or more chunks. Each chunk can contain up to three
+    buffers - a data buffer, a mask buffer (depending on null representation),
+    and an offsets buffer (if variable-size binary; e.g., variable-length
+    strings).
 
     Note: this Column object can only be produced by ``__dataframe__``, so
           doesn't need its own version or ``__column__`` protocol.
 
     """
 
-    def __init__(self, column, nan_as_null=False) -> None:
+    def __init__(self, column,
+                 nan_as_null : bool = True, 
+                 allow_copy: bool = False) -> None:
         """
         Note: doesn't deal with extension arrays yet, just assume a regular
         Series/ndarray for now.
         """
         if not isinstance(column, cudf.Series):
-            raise NotImplementedError(f"Columns of type {type(column)} not handled yet")
+            raise NotImplementedError("Columns of type {} not handled "
+                                      "yet".format(type(column)))
 
         # Store the column as a private attribute
         self._col = column
         self._nan_as_null = nan_as_null
+        self._allow_copy = allow_copy
 
     @property
     def size(self) -> int:
         """
         Size of the column, in elements.
-
-        Corresponds to DataFrame.num_rows() if column is a single chunk;
-        equal to size of this current chunk otherwise.
         """
         return self._col.size
 
@@ -376,11 +321,6 @@ def size(self) -> int:
     def offset(self) -> int:
         """
         Offset of first element. Always zero.
-        TODO: check `Always zero (in case of cudf)?`
-
-        May be > 0 if using chunks; for example for a column with N chunks of
-        equal size M (only the last chunk may be shorter),
-        ``offset = n * M``, ``n = 0 .. N-1``.
         """
         return 0
 
@@ -425,23 +365,27 @@ def dtype(self) -> Tuple[enum.IntEnum, int, str, str]:
               and nested (list, struct, map, union) dtypes.
         """
         dtype = self._col.dtype
+
+        # For now, assume that, if the column dtype is 'O' (i.e., `object`), then we have an array of strings
+        if not isinstance(dtype, cudf.CategoricalDtype) and dtype.kind == 'O':
+            return (_DtypeKind.STRING, 8, 'u', '=')
+
         return self._dtype_from_cudfdtype(dtype)
 
     def _dtype_from_cudfdtype(self, dtype) -> Tuple[enum.IntEnum, int, str, str]:
         """
-        See `self.dtype` for details
+        See `self.dtype` for details.
         """
         # Note: 'c' (complex) not handled yet (not in array spec v1).
         #       'b', 'B' (bytes), 'S', 'a', (old-style string) 'V' (void) not handled
         #       datetime and timedelta both map to datetime (is timedelta handled?)
         _k = _DtypeKind
-        _np_kinds = {'i': _k.INT, 'u': _k.UINT, 'f': _k.FLOAT, 'b': _k.BOOL,
-                     'U': _k.STRING,
-                     'M': _k.DATETIME, 'm': _k.DATETIME}
+        _np_kinds = {"i": _k.INT, "u": _k.UINT, "f": _k.FLOAT, "b": _k.BOOL,
+                     "U": _k.STRING,
+                     "M": _k.DATETIME, "m": _k.DATETIME}
         kind = _np_kinds.get(dtype.kind, None)
         if kind is None:
             # Not a NumPy dtype. Check if it's a categorical maybe
-            # CuPy uses NumPy dtypes.
             if isinstance(dtype, cudf.CategoricalDtype):
                 kind = 23
                 # Codes and categorical values dtypes are different.
@@ -451,7 +395,7 @@ def _dtype_from_cudfdtype(self, dtype) -> Tuple[enum.IntEnum, int, str, str]:
                 raise ValueError(f"Data type {dtype} not supported by exchange"
                                  "protocol")
 
-        if kind not in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL, _k.CATEGORICAL):
+        if kind not in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL, _k.CATEGORICAL, _k.STRING):
             raise NotImplementedError(f"Data type {dtype} not handled yet")
 
         bitwidth = dtype.itemsize * 8
@@ -459,7 +403,6 @@ def _dtype_from_cudfdtype(self, dtype) -> Tuple[enum.IntEnum, int, str, str]:
         endianness = dtype.byteorder if not kind == _k.CATEGORICAL else '='
         return (kind, bitwidth, format_str, endianness)
 
-
     @property
     def describe_categorical(self) -> Dict[str, Any]:
         """
@@ -478,9 +421,6 @@ def describe_categorical(self) -> Dict[str, Any]:
                                 categorical values to other objects exists
             - "mapping" : dict, Python-level only (e.g. ``{int: str}``).
                           None if not a dictionary-style categorical.
-
-
-        TBD: are there any other in-memory representations that are needed?
         """
         if not self.dtype[0] == _DtypeKind.CATEGORICAL:
             raise TypeError("`describe_categorical only works on a column with "
@@ -510,28 +450,25 @@ def describe_null(self) -> Tuple[int, Any]:
             - 3 : bit mask
             - 4 : byte mask
 
-        Value : if kind is "sentinel value", the actual value. None otherwise.
+        Value : if kind is "sentinel value", the actual value.  If kind is a bit
+        mask or a byte mask, the value (0 or 1) indicating a missing value. None
+        otherwise.
         """
-        _k = _DtypeKind
-        kind = self.dtype[0]
-        value = None
-        if kind in (_k.INT, _k.UINT, _k.FLOAT, _k.CATEGORICAL):
-            null = 3
-        # if kind == _k.FLOAT:
-        #     null = 1  # np.nan
-        # elif kind == _k.DATETIME:
-        #     null = 1  # np.datetime64('NaT')
-        # elif kind in (_k.INT, _k.UINT, _k.BOOL):
-        #     # TODO: check if extension dtypes are used once support for them is
-        #     #       implemented in this procotol code
-        #     null = 0  # integer and boolean dtypes are non-nullable
-        # elif kind == _k.CATEGORICAL:
-        #     # Null values for categoricals are stored as `-1` sentinel values
-        #     # in the category date (e.g., `col.values.codes` is int8 np.ndarray)
-        #     null = 2
-        #     value = -1
-        else:
-            raise NotImplementedError(f'Data type {self.dtype} not yet supported')
+        if self.null_count == 0:
+            # there is no validity mask in this case
+            # so making it non-nullable (hackingly)
+            null = 0
+            value = None
+        else :
+            _k = _DtypeKind
+            kind = self.dtype[0]
+            # bit mask is universally used in cudf for missing
+            if kind in (_k.INT, _k.UINT, _k.FLOAT, _k.CATEGORICAL,
+                        _k.STRING, _k.DATETIME):
+                null = 3
+                value = 0
+            else:
+                raise NotImplementedError(f"Data type {self.dtype} not yet supported")
 
         return null, value
 
@@ -539,16 +476,19 @@ def describe_null(self) -> Tuple[int, Any]:
     def null_count(self) -> int:
         """
         Number of null elements. Should always be known.
-
-        Note: Arrow uses -1 to indicate "unknown", but None seems cleaner.
         """
         return self._col.isna().sum()
 
+    @property
+    def metadata(self) -> Dict[str, Any]:
+        """
+        Store specific metadata of the column.
+        """
+        return {}
+
     def num_chunks(self) -> int:
         """
         Return the number of chunks the column consists of.
-
-        TBC: Seems like chunks are used for parallel computation purpose in cudf:`apply_chunks`.
         """
         return 1
 
@@ -560,60 +500,156 @@ def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable['_CuDFColumn']
         """
         return (self,)
 
-    def get_data_buffer(self) -> Tuple[_CuDFBuffer, Any]:  # Any is for self.dtype tuple
+    def get_buffers(self) -> Dict[str, Any]:
+        """
+        Return a dictionary containing the underlying buffers.
+
+        The returned dictionary has the following contents:
+
+            - "data": a two-element tuple whose first element is a buffer
+                      containing the data and whose second element is the data
+                      buffer's associated dtype.
+            - "validity": a two-element tuple whose first element is a buffer
+                          containing mask values indicating missing data and
+                          whose second element is the mask value buffer's
+                          associated dtype. None if the null representation is
+                          not a bit or byte mask.
+            - "offsets": a two-element tuple whose first element is a buffer
+                         containing the offset values for variable-size binary
+                         data (e.g., variable-length strings) and whose second
+                         element is the offsets buffer's associated dtype. None
+                         if the data buffer does not have an associated offsets
+                         buffer.
+        """
+        buffers = {}
+        buffers["data"] = self._get_data_buffer()
+        try:
+            buffers["validity"] = self._get_validity_buffer()
+        except:
+            buffers["validity"] = None
+
+        try:
+            buffers["offsets"] = self._get_offsets_buffer()
+        except:
+            buffers["offsets"] = None
+
+        return buffers
+
+    def _get_data_buffer(self) -> Tuple[_CuDFBuffer, Any]:  # Any is for self.dtype tuple
         """
-        Return the buffer containing the data.
+        Return the buffer containing the data and the buffer's associated dtype.
         """
         _k = _DtypeKind
+        invalid = self.describe_null[1]
         if self.dtype[0] in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL):
-            buffer = _CuDFBuffer(cp.array(self._col.fillna(0).to_gpu_array(), copy=False))
+            buffer = _CuDFBuffer(
+                cp.array(self._col.fillna(invalid).to_gpu_array(), copy=False),
+                allow_copy=self._allow_copy)
             dtype = self.dtype
         elif self.dtype[0] == _k.CATEGORICAL:
-            _, value = self.describe_null
             codes = self._col.cat.codes
-            # handling null/NaN
-            buffer = _CuDFBuffer(cp.array(codes.fillna(0), copy=False))
+            buffer = _CuDFBuffer(
+                cp.array(codes.fillna(invalid), copy=False),
+                allow_copy=self._allow_copy)
             dtype = self._dtype_from_cudfdtype(codes.dtype)
+        # elif self.dtype[0] == _k.STRING:
+        #     # Marshal the strings from a NumPy object array into a byte array
+        #     buf = self._col.to_numpy()
+        #     b = bytearray()
+
+        #     # TODO: this for-loop is slow; can be implemented in Cython/C/C++ later
+        #     for i in range(buf.size):
+        #         if type(buf[i]) == str:
+        #             b.extend(buf[i].encode(encoding="utf-8"))
+
+        #     # Convert the byte array to a Pandas "buffer" using a NumPy array as the backing store
+        #     buffer = _CuDFBuffer(np.frombuffer(b, dtype="uint8"))
+
+        #     # Define the dtype for the returned buffer
+        #     dtype = (_k.STRING, 8, "u", "=")  # note: currently only support native endianness
         else:
             raise NotImplementedError(f"Data type {self._col.dtype} not handled yet")
 
         return buffer, dtype
 
-    def get_mask(self) -> _CuDFBuffer:
+    def unpackbits(myarray, bitorder="big"):
+    
+        bitorder_op = {"big": '(myarray[i / 8] >> (7 - i % 8)) & 1;', 
+                        "little": '(myarray[i / 8] >> (i % 8)) & 1;'}
+        operation = bitorder_op.get(bitorder, None)
+        if operation == None:
+            raise KeyError(f"bitorder must be either 'big' or 'little' not '{bitorder}'")
+        _unpackbits_kernel = _core.ElementwiseKernel(
+        'raw uint8 myarray', 'T unpacked',
+        'unpacked = '+ operation,
+        'unpackbits_kernel'
+        )
+
+        if myarray.dtype != cupy.uint8:
+            raise TypeError('Expected an input array of unsigned byte data type')
+
+        unpacked = cupy.ndarray((myarray.size * 8), dtype=cupy.uint8)
+        return _unpackbits_kernel(myarray, unpacked)
+
+    def _get_validity_buffer(self) -> Tuple[_CuDFBuffer, Any]:
         """
-        Return the buffer containing the mask values indicating missing data.
+        Return the buffer containing the mask values indicating missing data and
+        the buffer's associated dtype.
 
         Raises RuntimeError if null representation is not a bit or byte mask.
         """
-        null, value = self.describe_null
-        buffer = None
-        if null == 0:
-            msg = "This column is non-nullable so does not have a mask"
-        elif null == 1:
-            msg = "This column uses NaN as null so does not have a separate mask"
-
-        elif null == 3:
-            
+        
+        null, invalid = self.describe_null
+        if null == 3:
             _k = _DtypeKind
-            if self.dtype[0] in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL):
-                buffer = _CuDFBuffer(cp.array(self._col.nullmask, copy=False))
-            elif self.dtype[0] == _k.CATEGORICAL:
-                codes = self._col.cat.codes.nullmask
-                # handling null/NaN
-                buffer = _CuDFBuffer(cp.array(codes, copy=False))
+            bitmask = unpackbits(cp.array(self._col._column.mask, copy=False), bitorder='little')[:len(self._col)]
+            buffer = _CuDFBuffer(bitmask)
+            dtype = (_k.UINT, 8, "C", "=")
+            return buffer, dtype
 
+        elif null == 1:
+            msg = "This column uses NaN as null so does not have a separate mask"
+        elif null == 0:   
+            msg = "This column is non-nullable so does not have a mask"
         else:
-            raise NotImplementedError('See self.describe_null')
+            raise NotImplementedError("See self.describe_null")
 
-        return buffer
+        raise RuntimeError(msg)
 
+    def _get_offsets_buffer(self) -> Tuple[_CuDFBuffer, Any]:
+        """
+        Return the buffer containing the offset values for variable-size binary
+        data (e.g., variable-length strings) and the buffer's associated dtype.
 
-    # def get_children(self) -> Iterable[Column]:
-    #     """
-    #     Children columns underneath the column, each object in this iterator
-    #     must adhere to the column specification
-    #     """
-    #     pass
+        Raises RuntimeError if the data buffer does not have an associated
+        offsets buffer.
+        """
+        _k = _DtypeKind
+        if self.dtype[0] == _k.STRING:
+            # For each string, we need to manually determine the next offset
+            values = self._col.to_numpy()
+            ptr = 0
+            offsets = [ptr]
+            for v in values:
+                # For missing values (in this case, `np.nan` values), we don't increment the pointer
+                if type(v) == str:
+                    b = v.encode(encoding="utf-8")
+                    ptr += len(b)
+
+                offsets.append(ptr)
+
+            # Convert the list of offsets to a CuPy array of signed 64-bit integers (note: Arrow allows the offsets array to be either `int32` or `int64`; here, we default to the latter)
+            buf = cp.asarray(offsets, dtype="int64")
+
+            # Convert the offsets to a cudf "buffer" using the CuPy array as the backing store
+            buffer = _CuDFBuffer(buf)
+
+            # Assemble the buffer dtype info
+            dtype = (_k.INT, 64, 'l', "=")  # note: currently only support native endianness
+        else:
+            raise RuntimeError("This column has a fixed-length dtype so does not have an offsets buffer")
+
+        return buffer, dtype
 
 class _CuDFDataFrame:
     """
@@ -624,20 +660,25 @@ class _CuDFDataFrame:
     ``cudf.DataFrame.__dataframe__`` as objects with the methods and
     attributes defined on this class.
     """
-    def __init__(self, df, nan_as_null : bool = False) -> None:
+    def __init__(self, df, nan_as_null : bool = True,
+                 allow_copy : bool = True) -> None:
         """
-        , device:str = 'gpu'
         Constructor - an instance of this (private) class is returned from
         `cudf.DataFrame.__dataframe__`.
         """
+        self._df = df
         # ``nan_as_null`` is a keyword intended for the consumer to tell the
         # producer to overwrite null values in the data with ``NaN`` (or ``NaT``).
         # This currently has no effect; once support for nullable extension
         # dtypes is added, this value should be propagated to columns.
-        #
-        # ``device`` indicates the target device for the data.
         self._nan_as_null = nan_as_null
-        self._df = df
+        self._allow_copy = allow_copy
+
+    @property
+    def metadata(self):
+        # `index` isn't a regular column, and the protocol doesn't support row
+        # labels - so we export it as cudf-specific metadata here.
+        return {"cudf.index": self._df.index}
 
     def num_columns(self) -> int:
         return len(self._df.columns)
@@ -652,36 +693,29 @@ def column_names(self) -> Iterable[str]:
         return self._df.columns.tolist()
 
     def get_column(self, i: int) -> _CuDFColumn:
-        return _CuDFColumn(self._df.iloc[:, i], self._nan_as_null)
+        return _CuDFColumn(
+            self._df.iloc[:, i], allow_copy=self._allow_copy)
 
     def get_column_by_name(self, name: str) -> _CuDFColumn:
-        return _CuDFColumn(self._df[name], self._nan_as_null)
+        return _CuDFColumn(
+            self._df[name], allow_copy=self._allow_copy)
 
     def get_columns(self) -> Iterable[_CuDFColumn]:
-        return [_CuDFColumn(self._df[name], self._nan_as_null) for name in self._df.columns]
+        return [_CuDFColumn(self._df[name], allow_copy=self._allow_copy)
+                for name in self._df.columns]
 
     def select_columns(self, indices: Sequence[int]) -> '_CuDFDataFrame':
         if not isinstance(indices, collections.Sequence):
             raise ValueError("`indices` is not a sequence")
 
         return _CuDFDataFrame(self._df.iloc[:, indices])
-    
-    def select_columns_by_name(self, names: Sequence[str]) -> '_CuDFDataFrame':
-        """
-            Create a new DataFrame by selecting a subset of columns by name.
-
-            Don't use pandas.DataFrame `xs` method as :
-            def xs(self, key, axis=0, level=None, drop_level: bool_t = True):
-            
-            Return cross-section from the Series/DataFrame.
 
-            This method takes a `key` argument to select data at a particular
-            level of a MultiIndex.
-        """
+    def select_columns_by_name(self, names: Sequence[str]) -> '_CuDFDataFrame':
         if not isinstance(names, collections.Sequence):
             raise ValueError("`names` is not a sequence")
 
-        return _CuDFDataFrame(self._df.loc[:, names], self._nan_as_null)
+        return _CuDFDataFrame(self._df.loc[:, names], self._nan_as_null,
+                                self.allow_copy)
 
     def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable['_CuDFDataFrame']:
         """

From 8450d7edfd8d2217341de35ee11129327c8633ff Mon Sep 17 00:00:00 2001
From: iskode <ismael.from.kone@gmail.com>
Date: Mon, 13 Sep 2021 17:56:07 +0000
Subject: [PATCH 08/60] missing values support for int, float and categorical

---
 python/cudf/cudf/core/df_protocol.py       | 84 +++++++++++-----------
 python/cudf/cudf/tests/test_df_protocol.py | 81 +++++++++++++++++----
 2 files changed, 108 insertions(+), 57 deletions(-)

diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py
index ba5291fb08d..b234cedc1b1 100644
--- a/python/cudf/cudf/core/df_protocol.py
+++ b/python/cudf/cudf/core/df_protocol.py
@@ -26,6 +26,7 @@
 import cudf
 import numpy as np
 import cupy as cp
+from cupy import _core
 import pandas._testing as tm
 import cudf.testing as testcase
 import pytest
@@ -37,7 +38,7 @@
 ColumnObject = Any
 
 
-def from_dataframe(df : DataFrameObject, copy: bool = False) :
+def from_dataframe(df : DataFrameObject, allow_copy: bool = False) :
     """
     Construct a cudf DataFrame from ``df`` if it supports ``__dataframe__``
     """
@@ -47,14 +48,12 @@ def from_dataframe(df : DataFrameObject, copy: bool = False) :
     if not hasattr(df, '__dataframe__'):
         raise ValueError("`df` does not support __dataframe__")
 
-    return _from_dataframe(df.__dataframe__(), copy=copy)
+    return _from_dataframe(df.__dataframe__(allow_copy=allow_copy))
 
 
 def _from_dataframe(df : DataFrameObject, copy: bool = False) :
     """
-    Note: not all cases are handled yet, only ones that can be implemented with
-    only Pandas. Later, we need to implement/test support for categoricals,
-    bit/byte masks, chunk handling, etc.
+    Create a cudf DataFrame object from DataFrameObject Interface.
     """
     # Check number of chunks, if there's more than one we need to iterate
     if df.num_chunks() > 1:
@@ -64,18 +63,22 @@ def _from_dataframe(df : DataFrameObject, copy: bool = False) :
     # least for now, deal with non-numpy dtypes later).
     columns = dict()
     _k = _DtypeKind
+    _buffers = []  # hold on to buffers, keeps memory alive
     for name in df.column_names():
         col = df.get_column_by_name(name)
         if col.dtype[0] in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL):
             # Simple numerical or bool dtype, turn into numpy array
-            columns[name] = convert_column_to_cupy_ndarray(col, copy=copy)
+            columns[name], _buf = convert_column_to_cupy_ndarray(col, copy=copy)
         elif col.dtype[0] == _k.CATEGORICAL:
-            columns[name] = convert_categorical_column(col, copy=copy)
-            names = df.column_names()
+            columns[name], _buf = convert_categorical_column(col, copy=copy)
         else:
             raise NotImplementedError(f"Data type {col.dtype[0]} not handled yet")
-    
-    return cudf.DataFrame(columns)
+        
+        _buffers.append(_buf)
+
+    df_new = cudf.DataFrame(columns)
+    df_new._buffers = _buffers
+    return df_new
 
 
 
@@ -88,6 +91,16 @@ class _DtypeKind(enum.IntEnum):
     DATETIME = 22
     CATEGORICAL = 23
 
+def set_missing_values(col, col_array):
+    series = cudf.Series(col_array)
+    null_kind, null_value = col.describe_null
+    if  null_kind != 0:
+        assert null_kind == 3, f"cudf supports only bit mask, null_kind should be 3 ." 
+        _mask_buffer, _mask_dtype = col.get_buffers()["validity"]
+        bitmask = buffer_to_cupy_ndarray(_mask_buffer, _mask_dtype)
+        series[bitmask==null_value] = None
+
+    return series
 
 def convert_column_to_cupy_ndarray(col : ColumnObject, copy : bool = False) -> np.ndarray:
     """
@@ -96,15 +109,20 @@ def convert_column_to_cupy_ndarray(col : ColumnObject, copy : bool = False) -> n
     if col.offset != 0:
         raise NotImplementedError("column.offset > 0 not handled yet")
 
-    # if col.describe_null[0] not in (0, 1):
-    #     raise NotImplementedError("Null values represented as masks or "
-    #                               "sentinel values not handled yet")
-
     _buffer, _dtype = col.get_buffers()['data']
-    _mask_buffer = col.get_buffers()['validity']
-    return buffer_to_cupy_ndarray(_buffer, _dtype, _mask_buffer, copy=copy)
+    if _buffer.__dlpack_device__()[0] == 2: # dataframe is on GPU/CUDA
+        x = cp.fromDlpack(_buffer.__dlpack__())
+
+    elif copy == False:
+        raise TypeError("This operation must copy data from CPU to GPU. Set `copy=True` to allow it.")
 
-def buffer_to_cupy_ndarray(_buffer, _dtype, _mask_buffer = None, copy : bool = False) -> cp.ndarray:
+    else:
+        x = _copy_buffer_to_gpu(_buffer, _dtype)
+
+    return set_missing_values(col, x), _buffer
+
+
+def buffer_to_cupy_ndarray(_buffer, _dtype, copy : bool = False) -> cp.ndarray:
     if _buffer.__dlpack_device__()[0] == 2: # dataframe is on GPU/CUDA
         x = cp.fromDlpack(_buffer.__dlpack__())
 
@@ -148,8 +166,6 @@ def convert_categorical_column(col : ColumnObject, copy:bool=False) :
     """
     Convert a categorical column to a Series instance
     """
-    
-
     ordered, is_dict, mapping = col.describe_categorical
     if not is_dict:
         raise NotImplementedError('Non-dictionary categoricals not supported yet')
@@ -159,28 +175,13 @@ def convert_categorical_column(col : ColumnObject, copy:bool=False) :
     #    codes = col._col.values.codes
     categories = cp.asarray(list(mapping.values()))
     codes_buffer, codes_dtype = col.get_buffers()['data']
-    _mask_buffer = col.get_buffers()['validity']
-    codes = buffer_to_cupy_ndarray(codes_buffer, codes_dtype, _mask_buffer, copy=copy)
+    codes = buffer_to_cupy_ndarray(codes_buffer, codes_dtype, copy=copy)
     values = categories[codes]
 
-    # Seems like Pandas can only construct with non-null values, so need to
+    # Seems like cudf can only construct with non-null values, so need to
     # null out the nulls later
     cat = cudf.CategoricalIndex(values, categories=categories, ordered=ordered)
-    series = cudf.Series(cat)
-
-    null_kind = col.describe_null[0]
-    if null_kind != 0:
-        print(null_kind)
-        if null_kind == 2:  # sentinel value
-            sentinel = col.describe_null[1]
-            series[codes == sentinel] = None
-        elif null_kind == 3:
-            pass
-        else:
-            raise NotImplementedError("Only categorical columns with sentinel "
-                                    "value supported at the moment")
-
-    return series
+    return set_missing_values(col, cat), codes_buffer
 
 
 def __dataframe__(self, nan_as_null : bool = False) -> dict:
@@ -243,7 +244,6 @@ def bufsize(self) -> int:
         Buffer size in bytes.
         """
         return self._x.data.mem.size
-        # return self._x.size * self._x.dtype.itemsize
 
     @property
     def ptr(self) -> int:
@@ -572,7 +572,7 @@ def _get_data_buffer(self) -> Tuple[_CuDFBuffer, Any]:  # Any is for self.dtype
 
         return buffer, dtype
 
-    def unpackbits(myarray, bitorder="big"):
+    def _unpackbits(self, myarray, bitorder="big"):
     
         bitorder_op = {"big": '(myarray[i / 8] >> (7 - i % 8)) & 1;', 
                         "little": '(myarray[i / 8] >> (i % 8)) & 1;'}
@@ -585,10 +585,10 @@ def unpackbits(myarray, bitorder="big"):
         'unpackbits_kernel'
         )
 
-        if myarray.dtype != cupy.uint8:
+        if myarray.dtype != cp.uint8:
             raise TypeError('Expected an input array of unsigned byte data type')
 
-        unpacked = cupy.ndarray((myarray.size * 8), dtype=cupy.uint8)
+        unpacked = cp.ndarray((myarray.size * 8), dtype=cp.uint8)
         return _unpackbits_kernel(myarray, unpacked)
 
     def _get_validity_buffer(self) -> Tuple[_CuDFBuffer, Any]:
@@ -602,7 +602,7 @@ def _get_validity_buffer(self) -> Tuple[_CuDFBuffer, Any]:
         null, invalid = self.describe_null
         if null == 3:
             _k = _DtypeKind
-            bitmask = unpackbits(cp.array(self._col._column.mask, copy=False), bitorder='little')[:len(self._col)]
+            bitmask = self._unpackbits(cp.array(self._col._column.mask, copy=False), bitorder='little')[:len(self._col)]
             buffer = _CuDFBuffer(bitmask)
             dtype = (_k.UINT, 8, "C", "=")
             return buffer, dtype
diff --git a/python/cudf/cudf/tests/test_df_protocol.py b/python/cudf/cudf/tests/test_df_protocol.py
index 99917b0a2a8..d8c5c2a1d49 100644
--- a/python/cudf/cudf/tests/test_df_protocol.py
+++ b/python/cudf/cudf/tests/test_df_protocol.py
@@ -6,7 +6,9 @@
     _from_dataframe, 
     _DtypeKind,
     __dataframe__,
-    _CuDFDataFrame
+    _CuDFDataFrame,
+    _CuDFColumn,
+    _CuDFBuffer
 )
 
 import cudf
@@ -21,11 +23,44 @@
     gen_rand,
 )
 import pandas as pd
+from typing import Any, Tuple
+
+DataFrameObject = Any
+
+def assert_buffer_equal(buffer_dtype: Tuple[_CuDFBuffer, Any], cudfcol:cudf.Series):
+    buf, dtype = buffer_dtype
+    assert buf.__dlpack_device__() == (2, 0)
+
+def assert_column_equal(col: _CuDFColumn, cudfcol:cudf.Series):
+    assert col.size == cudfcol.size 
+    assert col.offset == 0
+    assert col.null_count == cudfcol.isna().sum() 
+    assert col.num_chunks() == 1
+    if col.null_count == 0 :
+        pytest.raises(RuntimeError, col._get_validity_buffer)
+    assert_buffer_equal(col._get_data_buffer(), cudfcol)
+    null_kind, null_value = col.describe_null
+    if col.null_count == 0:
+        assert null_kind == 0
+        assert null_value == None
+    else:
+        assert null_kind == 3
+        assert null_value == 0
+
+
+def assert_dataframe_equal(dfo: DataFrameObject, df:cudf.DataFrame):
+    assert dfo.num_columns() == len(df.columns)
+    assert dfo.num_rows() == len(df)
+    assert dfo.num_chunks() == 1
+    assert dfo.column_names() == list(df.columns)
+    for col in df.columns:
+        assert_column_equal(dfo.get_column_by_name(col), df[col])
 
 
 def _test_from_dataframe_equals(dfobj, copy=False):
     df2 = _from_dataframe(dfobj, copy=copy)
 
+    assert_dataframe_equal(dfobj, df2)
     if isinstance(dfobj._df, cudf.DataFrame):
         assert_eq(dfobj._df, df2)
 
@@ -68,28 +103,44 @@ def test_mixed_intfloat_dtype():
     _test_datatype(data_intfloat)
 
 def test_categorical_dtype():
-
-    def test__dataframe__(df):
-        # Some detailed testing for correctness of dtype:
-        col = df.__dataframe__().get_column_by_name('A')
-        assert col.dtype[0] == _DtypeKind.CATEGORICAL
-        assert col.null_count == 0
-        assert col.num_chunks() == 1
-        assert col.describe_categorical == (False, True, {0: 1, 1: 2, 2: 5})
-
     cdf = cudf.DataFrame({"A": [1, 2, 5, 1]})
     cdf["A"] = cdf["A"].astype("category")
-    test__dataframe__(cdf)
+    col = cdf.__dataframe__().get_column_by_name('A')
+    assert col.dtype[0] == _DtypeKind.CATEGORICAL
+    assert col.describe_categorical == (False, True, {0: 1, 1: 2, 2: 5})
     _test_from_dataframe_equals(cdf.__dataframe__(), copy=False)
     _test_from_dataframe_equals(cdf.__dataframe__(), copy=True)
 
 def test_NA_int_dtype():
-    data_int = dict(a=[1, None, 3], b=[9, 10, None])
+    data_int = dict(a=[1, None, 3, None, 5], 
+                    b=[9, 10, None, 7, 8],
+                    c= [6, 19, 20, 100, 1000] )
     _test_datatype(data_int)
 
-# def test_NA2_int_dtype():
-#     data_int = dict(a=[1, None, 3, None, 5], b=[9, 10, None, 7, 8])
-#     _test_datatype(data_int)
+def test_NA_float_dtype():
+    data_float = dict(a=[1.4, None, 3.6, None, 5.2], 
+                    b=[9.7, 10.9, None, 7.8, 8.2],
+                    c= [6.1, 19.2, 20.3, 100.4, 1000.5] )
+    _test_datatype(data_float)
+
+def test_NA_categorical_dtype():
+    df = cudf.DataFrame({"A": [1, 2, 5, 1]})
+    df["B"] = df["A"].astype("category")
+    df.at[[1, 3], 'B'] = None  # Set two items to null
+
+    # Some detailed testing for correctness of dtype and null handling:
+    col = df.__dataframe__().get_column_by_name('B')
+    assert col.dtype[0] == _DtypeKind.CATEGORICAL
+    assert col.null_count == 2
+    assert col.describe_null == (3, 0)  # bit mask (kind 3); a 0 bit marks a null
+    assert col.num_chunks() == 1
+    assert col.describe_categorical == (False, True, {0: 1, 1: 2, 2: 5})
+    _test_from_dataframe_equals(df.__dataframe__(), copy=True)
+    _test_from_dataframe_equals(df.__dataframe__(), copy=False)
+
+    # df2 = _from_dataframe(df.__dataframe__())
+    # assert_dataframe_equal(df.__dataframe__(), df)
+    # tm.assert_frame_equal(df, df2)
 
 
 # def test_bool_dtype():

From ec842d62ab4e553eb5b12a7f9ace215bb5c058ef Mon Sep 17 00:00:00 2001
From: iskode <ismael.from.kone@gmail.com>
Date: Tue, 14 Sep 2021 08:40:35 +0000
Subject: [PATCH 09/60] add boolean support w/ missing values

---
 python/cudf/cudf/core/df_protocol.py       | 40 +++++++++++++++++-----
 python/cudf/cudf/tests/test_df_protocol.py | 10 ++++--
 2 files changed, 38 insertions(+), 12 deletions(-)

diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py
index b234cedc1b1..ec6b9212fd5 100644
--- a/python/cudf/cudf/core/df_protocol.py
+++ b/python/cudf/cudf/core/df_protocol.py
@@ -110,21 +110,37 @@ def convert_column_to_cupy_ndarray(col : ColumnObject, copy : bool = False) -> n
         raise NotImplementedError("column.offset > 0 not handled yet")
 
     _buffer, _dtype = col.get_buffers()['data']
-    if _buffer.__dlpack_device__()[0] == 2: # dataframe is on GPU/CUDA
-        x = cp.fromDlpack(_buffer.__dlpack__())
+    x = buffer_to_cupy_ndarray(_buffer, _dtype, copy=copy)
+    # if _buffer.__dlpack_device__()[0] == 2: # dataframe is on GPU/CUDA
+    #     _k = _DtypeKind
+    #     print(f'buffer dtype: {_dtype[0]}')
+    #     if _dtype[0] in (_k.INT, _k.UINT, _k.FLOAT, _k.CATEGORICAL, _k.BOOL):
+    #         x = cp.fromDlpack(_buffer.__dlpack__())
+    #         if _dtype[0] == _k.BOOL: 
+    #             print(f'before booleanizing: {x}')
+    #             x = x.astype(cp.bool_)
+    #             print(f'after booleanizing: {x}')
 
-    elif copy == False:
-        raise TypeError("This operation must copy data from CPU to GPU. Set `copy=True` to allow it.")
 
-    else:
-        x = _copy_buffer_to_gpu(_buffer, _dtype)
+    # elif copy == False:
+    #     raise TypeError("This operation must copy data from CPU to GPU. Set `copy=True` to allow it.")
+
+    # else:
+    #     x = _copy_buffer_to_gpu(_buffer, _dtype)
 
     return set_missing_values(col, x), _buffer
 
 
 def buffer_to_cupy_ndarray(_buffer, _dtype, copy : bool = False) -> cp.ndarray:
     if _buffer.__dlpack_device__()[0] == 2: # dataframe is on GPU/CUDA
-        x = cp.fromDlpack(_buffer.__dlpack__())
+        _k = _DtypeKind
+        print(f'buffer dtype: {_dtype[0]}')
+        if _dtype[0] in (_k.INT, _k.UINT, _k.FLOAT, _k.CATEGORICAL):
+            x = cp.fromDlpack(_buffer.__dlpack__())
+        elif _dtype[0] == _k.BOOL: 
+            x = cp.fromDlpack(_buffer.__dlpack__()).astype(cp.bool_)
+        else:
+            raise TypeError(f"dtype {_dtype[0]} not supported yet !")
 
     elif copy == False:
         raise TypeError("This operation must copy data from CPU to GPU. Set `copy=True` to allow it.")
@@ -464,7 +480,7 @@ def describe_null(self) -> Tuple[int, Any]:
             kind = self.dtype[0]
             # bit mask is universally used in cudf for missing
             if kind in (_k.INT, _k.UINT, _k.FLOAT, _k.CATEGORICAL,
-                        _k.STRING, _k.DATETIME):
+                        _k.BOOL, _k.STRING, _k.DATETIME):
                 null = 3
                 value = 0
             else:
@@ -541,11 +557,17 @@ def _get_data_buffer(self) -> Tuple[_CuDFBuffer, Any]:  # Any is for self.dtype
         """
         _k = _DtypeKind
         invalid = self.describe_null[1]
-        if self.dtype[0] in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL):
+        if self.dtype[0] in (_k.INT, _k.UINT, _k.FLOAT):
             buffer = _CuDFBuffer(
                 cp.array(self._col.fillna(invalid).to_gpu_array(), copy=False),
                 allow_copy=self._allow_copy)
             dtype = self.dtype
+        elif self.dtype[0] == _k.BOOL:
+            # convert bool to uint8 as dlpack does not support bool natively.
+            buffer = _CuDFBuffer(
+                cp.array(self._col.fillna(invalid).to_gpu_array(), dtype=cp.uint8, copy=False),
+                allow_copy=self._allow_copy)
+            dtype = self.dtype
         elif self.dtype[0] == _k.CATEGORICAL:
             codes = self._col.cat.codes
             buffer = _CuDFBuffer(
diff --git a/python/cudf/cudf/tests/test_df_protocol.py b/python/cudf/cudf/tests/test_df_protocol.py
index d8c5c2a1d49..f214807d75b 100644
--- a/python/cudf/cudf/tests/test_df_protocol.py
+++ b/python/cudf/cudf/tests/test_df_protocol.py
@@ -143,6 +143,10 @@ def test_NA_categorical_dtype():
     # tm.assert_frame_equal(df, df2)
 
 
-# def test_bool_dtype():
-#     data_bool = dict(a=[True, True, False], b=[False, True, False])
-#     _datatype(data_bool)
\ No newline at end of file
+def test_bool_dtype():
+    data_bool = dict(a=[True, True, False], b=[False, True, False])
+    _test_datatype(data_bool)
+
+def test_NA_bool_dtype():
+    data_bool = dict(a=[None, True, False], b=[False, None, None])
+    _test_datatype(data_bool)
\ No newline at end of file

From 13e0b95c6b593e24222629d8f12885519f82938a Mon Sep 17 00:00:00 2001
From: iskode <ismael.from.kone@gmail.com>
Date: Tue, 14 Sep 2021 09:09:59 +0000
Subject: [PATCH 10/60] refactor 'convert_column_to_cupy_ndarray' and replace
 'cp.array' with 'cp.asarray' to enforce zero-copy

---
 python/cudf/cudf/core/df_protocol.py       | 107 +++++++++------------
 python/cudf/cudf/tests/test_df_protocol.py |   5 -
 2 files changed, 44 insertions(+), 68 deletions(-)

diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py
index ec6b9212fd5..9aeb7188c9a 100644
--- a/python/cudf/cudf/core/df_protocol.py
+++ b/python/cudf/cudf/core/df_protocol.py
@@ -91,16 +91,6 @@ class _DtypeKind(enum.IntEnum):
     DATETIME = 22
     CATEGORICAL = 23
 
-def set_missing_values(col, col_array):
-    series = cudf.Series(col_array)
-    null_kind, null_value = col.describe_null
-    if  null_kind != 0:
-        assert null_kind == 3, f"cudf supports only bit mask, null_kind should be 3 ." 
-        _mask_buffer, _mask_dtype = col.get_buffers()["validity"]
-        bitmask = buffer_to_cupy_ndarray(_mask_buffer, _mask_dtype)
-        series[bitmask==null_value] = None
-
-    return series
 
 def convert_column_to_cupy_ndarray(col : ColumnObject, copy : bool = False) -> np.ndarray:
     """
@@ -111,47 +101,43 @@ def convert_column_to_cupy_ndarray(col : ColumnObject, copy : bool = False) -> n
 
     _buffer, _dtype = col.get_buffers()['data']
     x = buffer_to_cupy_ndarray(_buffer, _dtype, copy=copy)
-    # if _buffer.__dlpack_device__()[0] == 2: # dataframe is on GPU/CUDA
-    #     _k = _DtypeKind
-    #     print(f'buffer dtype: {_dtype[0]}')
-    #     if _dtype[0] in (_k.INT, _k.UINT, _k.FLOAT, _k.CATEGORICAL, _k.BOOL):
-    #         x = cp.fromDlpack(_buffer.__dlpack__())
-    #         if _dtype[0] == _k.BOOL: 
-    #             print(f'before booleanizing: {x}')
-    #             x = x.astype(cp.bool_)
-    #             print(f'after booleanizing: {x}')
-
-
-    # elif copy == False:
-    #     raise TypeError("This operation must copy data from CPU to GPU. Set `copy=True` to allow it.")
-
-    # else:
-    #     x = _copy_buffer_to_gpu(_buffer, _dtype)
 
     return set_missing_values(col, x), _buffer
 
 
 def buffer_to_cupy_ndarray(_buffer, _dtype, copy : bool = False) -> cp.ndarray:
     if _buffer.__dlpack_device__()[0] == 2: # dataframe is on GPU/CUDA
-        _k = _DtypeKind
-        print(f'buffer dtype: {_dtype[0]}')
-        if _dtype[0] in (_k.INT, _k.UINT, _k.FLOAT, _k.CATEGORICAL):
-            x = cp.fromDlpack(_buffer.__dlpack__())
-        elif _dtype[0] == _k.BOOL: 
-            x = cp.fromDlpack(_buffer.__dlpack__()).astype(cp.bool_)
-        else:
-            raise TypeError(f"dtype {_dtype[0]} not supported yet !")
-
-    elif copy == False:
-        raise TypeError("This operation must copy data from CPU to GPU. Set `copy=True` to allow it.")
-
+        x = _gpu_buffer_to_cupy(_buffer, _dtype)
     else:
-        x = _copy_buffer_to_gpu(_buffer, _dtype)
+        if not copy:
+            raise TypeError("This operation must copy data from CPU to GPU. Set `copy=True` to allow it.")
+        x = _cpu_buffer_to_cupy(_buffer, _dtype)
 
     return x
 
+def set_missing_values(col, col_array):
+    series = cudf.Series(col_array)
+    null_kind, null_value = col.describe_null
+    if  null_kind != 0:
+        assert null_kind == 3, f"cudf supports only bit mask, null_kind should be 3 ." 
+        _mask_buffer, _mask_dtype = col.get_buffers()["validity"]
+        bitmask = buffer_to_cupy_ndarray(_mask_buffer, _mask_dtype)
+        series[bitmask==null_value] = None
+
+    return series
 
-def _copy_buffer_to_gpu(_buffer, _dtype):
+def _gpu_buffer_to_cupy(_buffer, _dtype):
+    _k = _DtypeKind
+    print(f'buffer dtype: {_dtype[0]}')
+    if _dtype[0] in (_k.INT, _k.UINT, _k.FLOAT, _k.CATEGORICAL):
+        x = cp.fromDlpack(_buffer.__dlpack__())
+    elif _dtype[0] == _k.BOOL: 
+        x = cp.fromDlpack(_buffer.__dlpack__()).astype(cp.bool_)
+    else:
+        raise NotImplementedError(f"Data type {_dtype[0]} not handled yet")
+    return x
+
+def _cpu_buffer_to_cupy(_buffer, _dtype):
     # Handle the dtype
     kind = _dtype[0]
     bitwidth = _dtype[1]
@@ -175,7 +161,7 @@ def _copy_buffer_to_gpu(_buffer, _dtype):
     #       buffer! (not done yet, this is pretty awful ...)
     x = np.ctypeslib.as_array(data_pointer,
                               shape=(_buffer.bufsize // (bitwidth//8),))
-    return cp.array(x, dtype=column_dtype)
+    return cp.asarray(x, dtype=column_dtype)
 
 
 def convert_categorical_column(col : ColumnObject, copy:bool=False) :
@@ -200,12 +186,12 @@ def convert_categorical_column(col : ColumnObject, copy:bool=False) :
     return set_missing_values(col, cat), codes_buffer
 
 
-def __dataframe__(self, nan_as_null : bool = False) -> dict:
+def __dataframe__(self, nan_as_null : bool = False,
+                  allow_copy : bool = True) -> dict:
     """
-    , target_device:str = 'gpu'
-    The public method to attach to cudf.DataFrame
+    The public method to attach to cudf.DataFrame.
 
-    We'll attach it via monkeypatching here for demo purposes. If Pandas adopt
+    We'll attach it via monkey-patching here for demo purposes. If Pandas adopts
     the protocol, this will be a regular method on pandas.DataFrame.
 
     ``nan_as_null`` is a keyword intended for the consumer to tell the
@@ -213,20 +199,15 @@ def __dataframe__(self, nan_as_null : bool = False) -> dict:
     This currently has no effect; once support for nullable extension
     dtypes is added, this value should be propagated to columns.
 
-    ``target_device`` specifies the device where the returned dataframe protocol
-    object will live. Only `cpu` and `gpu` are supported for now.
+    ``allow_copy`` is a keyword that defines whether or not the library is
+    allowed to make a copy of the data. For example, copying data would be
+    necessary if a library supports strided buffers, given that this protocol
+    specifies contiguous buffers.
+    Currently, if the flag is set to ``False`` and a copy is needed, a
+    ``RuntimeError`` will be raised.
     """
-    # if target_device not in ['cpu', 'gpu']:
-    #     raise TypeError (f'Device {device} support not handle.')
-
-    # if device == 'cpu':
-    #     raise TypeError("This operation will copy data from GPU to CPU. Set `copy=True` to allow it.")
-
-
-    return _CuDFDataFrame(self, nan_as_null=nan_as_null)
-
-# Monkeypatch the Pandas DataFrame class to support the interchange protocol
-# cudf.DataFrame.__dataframe__ = __dataframe__
+    return _CuDFDataFrame(
+        self, nan_as_null=nan_as_null, allow_copy=allow_copy)
 
 
 # Implementation of interchange protocol
@@ -401,7 +382,7 @@ def _dtype_from_cudfdtype(self, dtype) -> Tuple[enum.IntEnum, int, str, str]:
                      "M": _k.DATETIME, "m": _k.DATETIME}
         kind = _np_kinds.get(dtype.kind, None)
         if kind is None:
-            # Not a NumPy dtype. Check if it's a categorical maybe
+            # Not a NumPy/CuPy dtype. Check if it's a categorical maybe
             if isinstance(dtype, cudf.CategoricalDtype):
                 kind = 23
                 # Codes and categorical values dtypes are different.
@@ -559,19 +540,19 @@ def _get_data_buffer(self) -> Tuple[_CuDFBuffer, Any]:  # Any is for self.dtype
         invalid = self.describe_null[1]
         if self.dtype[0] in (_k.INT, _k.UINT, _k.FLOAT):
             buffer = _CuDFBuffer(
-                cp.array(self._col.fillna(invalid).to_gpu_array(), copy=False),
+                cp.asarray(self._col.fillna(invalid).to_gpu_array()),
                 allow_copy=self._allow_copy)
             dtype = self.dtype
         elif self.dtype[0] == _k.BOOL:
             # convert bool to uint8 as dlpack does not support bool natively.
             buffer = _CuDFBuffer(
-                cp.array(self._col.fillna(invalid).to_gpu_array(), dtype=cp.uint8, copy=False),
+                cp.asarray(self._col.fillna(invalid).to_gpu_array(), dtype=cp.uint8),
                 allow_copy=self._allow_copy)
             dtype = self.dtype
         elif self.dtype[0] == _k.CATEGORICAL:
             codes = self._col.cat.codes
             buffer = _CuDFBuffer(
-                cp.array(codes.fillna(invalid), copy=False),
+                cp.asarray(codes.fillna(invalid)),
                 allow_copy=self._allow_copy)
             dtype = self._dtype_from_cudfdtype(codes.dtype)
         # elif self.dtype[0] == _k.STRING:
@@ -624,7 +605,7 @@ def _get_validity_buffer(self) -> Tuple[_CuDFBuffer, Any]:
         null, invalid = self.describe_null
         if null == 3:
             _k = _DtypeKind
-            bitmask = self._unpackbits(cp.array(self._col._column.mask, copy=False), bitorder='little')[:len(self._col)]
+            bitmask = self._unpackbits(cp.asarray(self._col._column.mask), bitorder='little')[:len(self._col)]
             buffer = _CuDFBuffer(bitmask)
             dtype = (_k.UINT, 8, "C", "=")
             return buffer, dtype
diff --git a/python/cudf/cudf/tests/test_df_protocol.py b/python/cudf/cudf/tests/test_df_protocol.py
index f214807d75b..b9eae721353 100644
--- a/python/cudf/cudf/tests/test_df_protocol.py
+++ b/python/cudf/cudf/tests/test_df_protocol.py
@@ -138,11 +138,6 @@ def test_NA_categorical_dtype():
     _test_from_dataframe_equals(df.__dataframe__(), copy=True)
     _test_from_dataframe_equals(df.__dataframe__(), copy=False)
 
-    # df2 = _from_dataframe(df.__dataframe__())
-    # assert_dataframe_equal(df.__dataframe__(), df)
-    # tm.assert_frame_equal(df, df2)
-
-
 def test_bool_dtype():
     data_bool = dict(a=[True, True, False], b=[False, True, False])
     _test_datatype(data_bool)

From dfa02a2e9a40fbc032171a0410147868a0b79b6f Mon Sep 17 00:00:00 2001
From: iskode <ismael.from.kone@gmail.com>
Date: Tue, 14 Sep 2021 14:54:16 +0000
Subject: [PATCH 11/60] change 'copy' to 'allow_copy' code-wide

---
 python/cudf/cudf/core/dataframe.py         | 10 +++--
 python/cudf/cudf/core/df_protocol.py       | 24 ++++++-----
 python/cudf/cudf/tests/test_df_protocol.py | 49 +++++++++++++---------
 3 files changed, 48 insertions(+), 35 deletions(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 38b2141b987..fca86e788a0 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -7412,11 +7412,13 @@ def explode(self, column, ignore_index=False):
 
         return super()._explode(column, ignore_index)
 
-    def __dataframe__(self, nan_as_null : bool = False):
-        return df_protocol.__dataframe__(self, nan_as_null=nan_as_null)
+    def __dataframe__(self, nan_as_null : bool = False,
+                      allow_copy : bool = True):
+        return df_protocol.__dataframe__(self, nan_as_null=nan_as_null,
+                                         allow_copy=allow_copy)
     
-def from_dataframe(df, copy = False):
-    return df_protocol.from_dataframe(df, copy=copy)
+def from_dataframe(df, allow_copy = False):
+    return df_protocol.from_dataframe(df, allow_copy=allow_copy)
 
 def from_pandas(obj, nan_as_null=None):
     """
diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py
index 9aeb7188c9a..3a5b70e72df 100644
--- a/python/cudf/cudf/core/df_protocol.py
+++ b/python/cudf/cudf/core/df_protocol.py
@@ -51,9 +51,9 @@ def from_dataframe(df : DataFrameObject, allow_copy: bool = False) :
     return _from_dataframe(df.__dataframe__(allow_copy=allow_copy))
 
 
-def _from_dataframe(df : DataFrameObject, copy: bool = False) :
+def _from_dataframe(df : DataFrameObject) :
     """
-    Create a cudf DataFrame object from DataFrameObject Interface.
+    Create a cudf DataFrame object from DataFrameObject.
     """
     # Check number of chunks, if there's more than one we need to iterate
     if df.num_chunks() > 1:
@@ -68,9 +68,9 @@ def _from_dataframe(df : DataFrameObject, copy: bool = False) :
         col = df.get_column_by_name(name)
         if col.dtype[0] in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL):
             # Simple numerical or bool dtype, turn into numpy array
-            columns[name], _buf = convert_column_to_cupy_ndarray(col, copy=copy)
+            columns[name], _buf = convert_column_to_cupy_ndarray(col, allow_copy=col._allow_copy)
         elif col.dtype[0] == _k.CATEGORICAL:
-            columns[name], _buf = convert_categorical_column(col, copy=copy)
+            columns[name], _buf = convert_categorical_column(col, allow_copy=col._allow_copy)
         else:
             raise NotImplementedError(f"Data type {col.dtype[0]} not handled yet")
         
@@ -92,7 +92,7 @@ class _DtypeKind(enum.IntEnum):
     CATEGORICAL = 23
 
 
-def convert_column_to_cupy_ndarray(col : ColumnObject, copy : bool = False) -> np.ndarray:
+def convert_column_to_cupy_ndarray(col:ColumnObject, allow_copy:bool = False) -> cp.ndarray:
     """
     Convert an int, uint, float or bool column to a numpy array
     """
@@ -100,17 +100,18 @@ def convert_column_to_cupy_ndarray(col : ColumnObject, copy : bool = False) -> n
         raise NotImplementedError("column.offset > 0 not handled yet")
 
     _buffer, _dtype = col.get_buffers()['data']
-    x = buffer_to_cupy_ndarray(_buffer, _dtype, copy=copy)
+    x = buffer_to_cupy_ndarray(_buffer, _dtype, allow_copy=allow_copy)
 
     return set_missing_values(col, x), _buffer
 
 
-def buffer_to_cupy_ndarray(_buffer, _dtype, copy : bool = False) -> cp.ndarray:
+def buffer_to_cupy_ndarray(_buffer, _dtype, allow_copy : bool = False) -> cp.ndarray:
     if _buffer.__dlpack_device__()[0] == 2: # dataframe is on GPU/CUDA
         x = _gpu_buffer_to_cupy(_buffer, _dtype)
     else:
-        if not copy:
-            raise TypeError("This operation must copy data from CPU to GPU. Set `copy=True` to allow it.")
+        if not allow_copy:
+            raise TypeError("This operation must copy data from CPU to GPU."
+                            "Set `allow_copy=True` to allow it.")
         x = _cpu_buffer_to_cupy(_buffer, _dtype)
 
     return x
@@ -164,7 +165,7 @@ def _cpu_buffer_to_cupy(_buffer, _dtype):
     return cp.asarray(x, dtype=column_dtype)
 
 
-def convert_categorical_column(col : ColumnObject, copy:bool=False) :
+def convert_categorical_column(col : ColumnObject, allow_copy:bool=False) :
     """
     Convert a categorical column to a Series instance
     """
@@ -177,7 +178,8 @@ def convert_categorical_column(col : ColumnObject, copy:bool=False) :
     #    codes = col._col.values.codes
     categories = cp.asarray(list(mapping.values()))
     codes_buffer, codes_dtype = col.get_buffers()['data']
-    codes = buffer_to_cupy_ndarray(codes_buffer, codes_dtype, copy=copy)
+    codes = buffer_to_cupy_ndarray(codes_buffer, codes_dtype, 
+                                   allow_copy=allow_copy)
     values = categories[codes]
 
     # Seems like cudf can only construct with non-null values, so need to
diff --git a/python/cudf/cudf/tests/test_df_protocol.py b/python/cudf/cudf/tests/test_df_protocol.py
index b9eae721353..f89ddeeb0e3 100644
--- a/python/cudf/cudf/tests/test_df_protocol.py
+++ b/python/cudf/cudf/tests/test_df_protocol.py
@@ -38,6 +38,7 @@ def assert_column_equal(col: _CuDFColumn, cudfcol:cudf.Series):
     assert col.num_chunks() == 1
     if col.null_count == 0 :
         pytest.raises(RuntimeError, col._get_validity_buffer)
+        assert col.get_buffers()['validity'] == None
     assert_buffer_equal(col._get_data_buffer(), cudfcol)
     null_kind, null_value = col.describe_null
     if col.null_count == 0:
@@ -57,8 +58,8 @@ def assert_dataframe_equal(dfo: DataFrameObject, df:cudf.DataFrame):
         assert_column_equal(dfo.get_column_by_name(col), df[col])
 
 
-def _test_from_dataframe_equals(dfobj, copy=False):
-    df2 = _from_dataframe(dfobj, copy=copy)
+def _test_from_dataframe_equals(dfobj):
+    df2 = _from_dataframe(dfobj)
 
     assert_dataframe_equal(dfobj, df2)
     if isinstance(dfobj._df, cudf.DataFrame):
@@ -72,23 +73,26 @@ def _test_from_dataframe_equals(dfobj, copy=False):
 
 
 def _test_from_dataframe_exception(dfobj):
-    exception_msg = "This operation must copy data from CPU to GPU. Set `copy=True` to allow it."
+    exception_msg = "This operation must copy data from CPU to GPU. Set `allow_copy=True` to allow it."
     with pytest.raises(TypeError, match=exception_msg):
-        df2 = _from_dataframe(dfobj, copy=False)
+        df2 = _from_dataframe(dfobj)
 
 def _test_datatype(data):
     cdf = cudf.DataFrame(data=data)
-    cdfobj = cdf.__dataframe__()
-    print(cdfobj)
-    _test_from_dataframe_equals(cdfobj, copy=False)
-    _test_from_dataframe_equals(cdfobj, copy=True)
+    _test_from_dataframe_equals(cdf.__dataframe__(allow_copy=False))
+    _test_from_dataframe_equals(cdf.__dataframe__(allow_copy=True))
 
     # pdf = pd.DataFrame(data=data)
     # cpu_dfobj = _CuDFDataFrame(pdf)
     # _test_from_dataframe_exception(cpu_dfobj)
-    # _test_from_dataframe_equals(cpu_dfobj, copy=True)
+    # _test_from_dataframe_equals(cpu_dfobj, allow_copy=True)
     
 
+def test_from_dataframe():
+    data = dict(a=[1, 2, 3], b=[9, 10, 11])
+    df1 = cudf.DataFrame(data=data)
+    df2 = cudf.from_dataframe(df1)
+    assert_eq(df1, df2)
     
 def test_int_dtype():
     data_int = dict(a=[1, 2, 3], b=[9, 10, 11])
@@ -98,18 +102,24 @@ def test_float_dtype():
     data_float = dict(a=[1.5, 2.5, 3.5], b=[9.2, 10.5, 11.8])
     _test_datatype(data_float)
 
-def test_mixed_intfloat_dtype():
-    data_intfloat = dict(a=[1, 2, 3], b=[1.5, 2.5, 3.5])
-    _test_datatype(data_intfloat)
-
 def test_categorical_dtype():
     cdf = cudf.DataFrame({"A": [1, 2, 5, 1]})
     cdf["A"] = cdf["A"].astype("category")
     col = cdf.__dataframe__().get_column_by_name('A')
     assert col.dtype[0] == _DtypeKind.CATEGORICAL
     assert col.describe_categorical == (False, True, {0: 1, 1: 2, 2: 5})
-    _test_from_dataframe_equals(cdf.__dataframe__(), copy=False)
-    _test_from_dataframe_equals(cdf.__dataframe__(), copy=True)
+    _test_from_dataframe_equals(cdf.__dataframe__(allow_copy=False))
+    _test_from_dataframe_equals(cdf.__dataframe__(allow_copy=True))
+
+def test_bool_dtype():
+    data_bool = dict(a=[True, True, False], b=[False, True, False])
+    _test_datatype(data_bool)
+
+def test_mixed_dtype():
+    data_mixed = dict(int=[1, 2, 3], float=[1.5, 2.5, 3.5],
+                        bool=[True, False, True], categorical=[5, 1, 5])
+    _test_datatype(data_mixed)
+
 
 def test_NA_int_dtype():
     data_int = dict(a=[1, None, 3, None, 5], 
@@ -135,12 +145,11 @@ def test_NA_categorical_dtype():
     assert col.describe_null == (3, 0)  # sentinel value -1
     assert col.num_chunks() == 1
     assert col.describe_categorical == (False, True, {0: 1, 1: 2, 2: 5})
-    _test_from_dataframe_equals(df.__dataframe__(), copy=True)
-    _test_from_dataframe_equals(df.__dataframe__(), copy=False)
+    _test_from_dataframe_equals(df.__dataframe__(allow_copy=False))
+    _test_from_dataframe_equals(df.__dataframe__(allow_copy=True))
+
+
 
-def test_bool_dtype():
-    data_bool = dict(a=[True, True, False], b=[False, True, False])
-    _test_datatype(data_bool)
 
 def test_NA_bool_dtype():
     data_bool = dict(a=[None, True, False], b=[False, None, None])

From d0cd04c5b3f4a72e6239a1f8a57655fcbb006a2a Mon Sep 17 00:00:00 2001
From: iskode <ismael.from.kone@gmail.com>
Date: Wed, 22 Sep 2021 03:19:59 +0000
Subject: [PATCH 12/60] make "from_dataframe" accessible from cudf:
 cudf.from_dataframe

---
 python/cudf/cudf/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py
index 6b5e5b858f0..1d35682ae82 100644
--- a/python/cudf/cudf/__init__.py
+++ b/python/cudf/cudf/__init__.py
@@ -42,7 +42,7 @@
     UInt64Index,
     interval_range,
 )
-from cudf.core.dataframe import DataFrame, from_pandas, merge
+from cudf.core.dataframe import DataFrame, from_pandas, merge, from_dataframe
 from cudf.core.series import Series
 from cudf.core.multiindex import MultiIndex
 from cudf.core.cut import cut

From 2e85f5d0cca987dc9d539f712bcb1a0989e301bd Mon Sep 17 00:00:00 2001
From: iskode <ismael.from.kone@gmail.com>
Date: Thu, 30 Sep 2021 01:17:19 +0000
Subject: [PATCH 13/60] minor corrections + remove 'unpackbits' in favor of
 cudf's own function

---
 python/cudf/cudf/core/df_protocol.py | 77 +++++++++++-----------------
 1 file changed, 29 insertions(+), 48 deletions(-)

diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py
index c6afb505623..3b1023f67e9 100644
--- a/python/cudf/cudf/core/df_protocol.py
+++ b/python/cudf/cudf/core/df_protocol.py
@@ -129,7 +129,6 @@ def set_missing_values(col, col_array):
 
 def _gpu_buffer_to_cupy(_buffer, _dtype):
     _k = _DtypeKind
-    print(f'buffer dtype: {_dtype[0]}')
     if _dtype[0] in (_k.INT, _k.UINT, _k.FLOAT, _k.CATEGORICAL):
         x = cp.fromDlpack(_buffer.__dlpack__())
     elif _dtype[0] == _k.BOOL: 
@@ -242,7 +241,7 @@ def bufsize(self) -> int:
         """
         Buffer size in bytes.
         """
-        return self._x.data.mem.size
+        return self._x.size * self._x.dtype.itemsize
 
     @property
     def ptr(self) -> int:
@@ -577,25 +576,6 @@ def _get_data_buffer(self) -> Tuple[_CuDFBuffer, Any]:  # Any is for self.dtype
 
         return buffer, dtype
 
-    def _unpackbits(self, myarray, bitorder="big"):
-    
-        bitorder_op = {"big": '(myarray[i / 8] >> (7 - i % 8)) & 1;', 
-                        "little": '(myarray[i / 8] >> (i % 8)) & 1;'}
-        operation = bitorder_op.get(bitorder, None)
-        if operation == None:
-            raise KeyError(f"bitorder must be either 'big' or 'little' not '{bitorder}'")
-        _unpackbits_kernel = _core.ElementwiseKernel(
-        'raw uint8 myarray', 'T unpacked',
-        'unpacked = '+ operation,
-        'unpackbits_kernel'
-        )
-
-        if myarray.dtype != cp.uint8:
-            raise TypeError('Expected an input array of unsigned byte data type')
-
-        unpacked = cp.ndarray((myarray.size * 8), dtype=cp.uint8)
-        return _unpackbits_kernel(myarray, unpacked)
-
     def _get_validity_buffer(self) -> Tuple[_CuDFBuffer, Any]:
         """
         Return the buffer containing the mask values indicating missing data and
@@ -607,7 +587,7 @@ def _get_validity_buffer(self) -> Tuple[_CuDFBuffer, Any]:
         null, invalid = self.describe_null
         if null == 3:
             _k = _DtypeKind
-            bitmask = self._unpackbits(cp.asarray(self._col._column.mask), bitorder='little')[:len(self._col)]
+            bitmask = cp.asarray(self._col._column._get_mask_as_column().to_gpu_array(), dtype=cp.uint8)
             buffer = _CuDFBuffer(bitmask)
             dtype = (_k.UINT, 8, "C", "=")
             return buffer, dtype
@@ -629,32 +609,33 @@ def _get_offsets_buffer(self) -> Tuple[_CuDFBuffer, Any]:
         Raises RuntimeError if the data buffer does not have an associated
         offsets buffer.
         """
-        _k = _DtypeKind
-        if self.dtype[0] == _k.STRING:
-            # For each string, we need to manually determine the next offset
-            values = self._col.to_numpy()
-            ptr = 0
-            offsets = [ptr]
-            for v in values:
-                # For missing values (in this case, `np.nan` values), we don't increment the pointer)
-                if type(v) == str:
-                    b = v.encode(encoding="utf-8")
-                    ptr += len(b)
-
-                offsets.append(ptr)
-
-            # Convert the list of offsets to a NumPy array of signed 64-bit integers (note: Arrow allows the offsets array to be either `int32` or `int64`; here, we default to the latter)
-            buf = cp.asarray(offsets, dtype="int64")
-
-            # Convert the offsets to a Pandas "buffer" using the NumPy array as the backing store
-            buffer = _CuDFBuffer(buf)
-
-            # Assemble the buffer dtype info
-            dtype = (_k.INT, 64, 'l', "=")  # note: currently only support native endianness
-        else:
-            raise RuntimeError("This column has a fixed-length dtype so does not have an offsets buffer")
+        # _k = _DtypeKind
+        # if self.dtype[0] == _k.STRING:
+        #     # For each string, we need to manually determine the next offset
+        #     values = self._col.to_numpy()
+        #     ptr = 0
+        #     offsets = [ptr]
+        #     for v in values:
+        #         # For missing values (in this case, `np.nan` values), we don't increment the pointer)
+        #         if type(v) == str:
+        #             b = v.encode(encoding="utf-8")
+        #             ptr += len(b)
 
-        return buffer, dtype
+        #         offsets.append(ptr)
+
+        #     # Convert the list of offsets to a NumPy array of signed 64-bit integers (note: Arrow allows the offsets array to be either `int32` or `int64`; here, we default to the latter)
+        #     buf = cp.asarray(offsets, dtype="int64")
+
+        #     # Convert the offsets to a Pandas "buffer" using the NumPy array as the backing store
+        #     buffer = _CuDFBuffer(buf)
+
+        #     # Assemble the buffer dtype info
+        #     dtype = (_k.INT, 64, 'l', "=")  # note: currently only support native endianness
+        # else:
+        #     raise RuntimeError("This column has a fixed-length dtype so does not have an offsets buffer")
+
+        # return buffer, dtype
+        pass
 
 class _CuDFDataFrame:
     """
@@ -720,7 +701,7 @@ def select_columns_by_name(self, names: Sequence[str]) -> '_CuDFDataFrame':
             raise ValueError("`names` is not a sequence")
 
         return _CuDFDataFrame(self._df.loc[:, names], self._nan_as_null,
-                                self.allow_copy)
+                                self._allow_copy)
 
     def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable['_CuDFDataFrame']:
         """

From 9ed11a3714d4408443a9fcd05d5549fdfe7a6b26 Mon Sep 17 00:00:00 2001
From: iskode <ismael.from.kone@gmail.com>
Date: Fri, 1 Oct 2021 22:17:58 +0000
Subject: [PATCH 14/60] use cudf Column object as _CuDFColumn's _col attribute
 instead of cudf Series object

---
 python/cudf/cudf/core/df_protocol.py | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py
index 3b1023f67e9..26dd07c1fd4 100644
--- a/python/cudf/cudf/core/df_protocol.py
+++ b/python/cudf/cudf/core/df_protocol.py
@@ -24,12 +24,9 @@
 from typing import Any, Optional, Tuple, Dict, Iterable, Sequence
 
 import cudf
+from cudf.core.column import as_column
 import numpy as np
 import cupy as cp
-from cupy import _core
-import pandas._testing as tm
-import cudf.testing as testcase
-import pytest
 
 
 # A typing protocol could be added later to let Mypy validate code using
@@ -304,7 +301,7 @@ def __init__(self, column,
                                       "yet".format(type(column)))
 
         # Store the column as a private attribute
-        self._col = column
+        self._col = as_column(column)
         self._nan_as_null = nan_as_null
         self._allow_copy = allow_copy
 
@@ -386,9 +383,9 @@ def _dtype_from_cudfdtype(self, dtype) -> Tuple[enum.IntEnum, int, str, str]:
             # Not a NumPy/CuPy dtype. Check if it's a categorical maybe
             if isinstance(dtype, cudf.CategoricalDtype):
                 kind = _k.CATEGORICAL
-                # Codes and categorical values dtypes are different.
+                # Codes and categories' dtypes are different.
                 # We use codes' dtype as these are stored in the buffer. 
-                dtype = self._col.cat.codes.dtype
+                dtype = self._col.codes.dtype
             else:
                 raise ValueError(f"Data type {dtype} not supported by exchange"
                                  "protocol")
@@ -428,9 +425,9 @@ def describe_categorical(self) -> Tuple[Any, bool, Dict[int, Any]]:
         is_dictionary = True
         # NOTE: this shows the children approach is better, transforming
         # `categories` to a "mapping" dict is inefficient
-        codes = self._col.cat.codes  # ndarray, length `self.size`
+        codes = self._col.codes  # ndarray, length `self.size`
         # categories.values is ndarray of length n_categories
-        categories = self._col.cat.categories
+        categories = self._col.categories
         mapping = {ix: val for ix, val in enumerate(categories.values_host)}
         return ordered, is_dictionary, mapping
 
@@ -551,7 +548,7 @@ def _get_data_buffer(self) -> Tuple[_CuDFBuffer, Any]:  # Any is for self.dtype
                 allow_copy=self._allow_copy)
             dtype = self.dtype
         elif self.dtype[0] == _k.CATEGORICAL:
-            codes = self._col.cat.codes
+            codes = self._col.codes
             buffer = _CuDFBuffer(
                 cp.asarray(codes.fillna(invalid)),
                 allow_copy=self._allow_copy)
@@ -587,7 +584,7 @@ def _get_validity_buffer(self) -> Tuple[_CuDFBuffer, Any]:
         null, invalid = self.describe_null
         if null == 3:
             _k = _DtypeKind
-            bitmask = cp.asarray(self._col._column._get_mask_as_column().to_gpu_array(), dtype=cp.uint8)
+            bitmask = cp.asarray(self._col._get_mask_as_column().to_gpu_array(), dtype=cp.uint8)
             buffer = _CuDFBuffer(bitmask)
             dtype = (_k.UINT, 8, "C", "=")
             return buffer, dtype

From ca9686d12a0fa68a6711f316cd202635e287e89a Mon Sep 17 00:00:00 2001
From: iskode <ismael.from.kone@gmail.com>
Date: Wed, 6 Oct 2021 06:47:07 +0000
Subject: [PATCH 15/60] use cudf buffer in _CuDFBuffer class

---
 python/cudf/cudf/core/df_protocol.py | 84 ++++++++++++++++++----------
 1 file changed, 55 insertions(+), 29 deletions(-)

diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py
index 26dd07c1fd4..bc1f905f13f 100644
--- a/python/cudf/cudf/core/df_protocol.py
+++ b/python/cudf/cudf/core/df_protocol.py
@@ -24,7 +24,8 @@
 from typing import Any, Optional, Tuple, Dict, Iterable, Sequence
 
 import cudf
-from cudf.core.column import as_column
+from cudf.core.column import as_column, build_column, build_categorical_column
+from cudf.core.buffer import Buffer
 import numpy as np
 import cupy as cp
 
@@ -96,10 +97,20 @@ def convert_column_to_cupy_ndarray(col:ColumnObject, allow_copy:bool = False) ->
     if col.offset != 0:
         raise NotImplementedError("column.offset > 0 not handled yet")
 
-    _buffer, _dtype = col.get_buffers()['data']
-    x = buffer_to_cupy_ndarray(_buffer, _dtype, allow_copy=allow_copy)
+    _dbuffer, _ddtype = col.get_buffers()['data']
+    dcol = build_column(Buffer(_dbuffer.ptr, _dbuffer.bufsize), protocol_dtype_to_np_dtype(_ddtype))
+    null_kind, null_value = col.describe_null
+    if null_kind != 0:
+        _vbuffer, _vdtype = col.get_buffers()['validity']
+        valid_mask = cp.asarray(Buffer(_vbuffer.ptr, _vbuffer.bufsize), cp.bool8)
+        dcol[~valid_mask] = None
+        
+    return dcol, _dbuffer
+                #  Buffer(_vbuffer.ptr, _vbuffer.bufsize)if _vbuffer != None else None)
+    # x = buffer_to_cupy_ndarray(_buffer, _dtype, allow_copy=allow_copy)
+    # x = buffer_to_cupy_ndarray(_buffer, _dtype, allow_copy=allow_copy)
 
-    return set_missing_values(col, x), _buffer
+    # return set_missing_values(col, x), _buffer
 
 
 def buffer_to_cupy_ndarray(_buffer, _dtype, allow_copy : bool = False) -> cp.ndarray:
@@ -134,20 +145,25 @@ def _gpu_buffer_to_cupy(_buffer, _dtype):
         raise NotImplementedError(f"Data type {_dtype[0]} not handled yet")
     return x
 
-def _cpu_buffer_to_cupy(_buffer, _dtype):
-    # Handle the dtype
+def protocol_dtype_to_np_dtype(_dtype):
+    print(_dtype)
     kind = _dtype[0]
     bitwidth = _dtype[1]
     _k = _DtypeKind
-    if _dtype[0] not in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL):
-        raise RuntimeError("Not a boolean, integer or floating-point dtype")
+    if _dtype[0] not in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL,_k.CATEGORICAL,
+                         _k.STRING, _k.DATETIME):
+        raise RuntimeError(f"Data type {_dtype[0]} not handled yet")
 
     _ints = {8: np.int8, 16: np.int16, 32: np.int32, 64: np.int64}
     _uints = {8: np.uint8, 16: np.uint16, 32: np.uint32, 64: np.uint64}
     _floats = {32: np.float32, 64: np.float64}
     _np_dtypes = {0: _ints, 1: _uints, 2: _floats, 20: {8: bool}}
-    column_dtype = _np_dtypes[kind][bitwidth]
+    return _np_dtypes[kind][bitwidth]
 
+def _cpu_buffer_to_cupy(_buffer, _dtype):
+    # Handle the dtype
+   
+    column_dtype = protocol_dtype_to_np_dtype(_dtype)
     # No DLPack yet, so need to construct a new ndarray from the data pointer
     # and size in the buffer plus the dtype on the column
     ctypes_type = np.ctypeslib.as_ctypes_type(column_dtype)
@@ -216,18 +232,18 @@ class _CuDFBuffer:
     Data in the buffer is guaranteed to be contiguous in memory.
     """
 
-    def __init__(self, x : cp.ndarray, allow_copy : bool = True) -> None:
+    def __init__(self, x : Buffer, allow_copy : bool = True) -> None:
         """
-        Handle only regular columns (= numpy arrays) for now.
+        Use cudf Buffer object.
         """
-        if not x.strides == (x.dtype.itemsize,):
-            # The protocol does not support strided buffers, so a copy is
-            # necessary. If that's not allowed, we need to raise an exception.
-            if allow_copy:
-                x = x.copy()
-            else:
-                raise RuntimeError("Exports cannot be zero-copy in the case "
-                                   "of a non-contiguous buffer")
+        # if not x.strides == (x.dtype.itemsize,):
+        #     # The protocol does not support strided buffers, so a copy is
+        #     # necessary. If that's not allowed, we need to raise an exception.
+        #     if allow_copy:
+        #         x = x.copy()
+        #     else:
+        #         raise RuntimeError("Exports cannot be zero-copy in the case "
+        #                            "of a non-contiguous buffer")
 
         # Store the numpy array in which the data resides as a private
         # attribute, so we can use it to retrieve the public attributes
@@ -238,21 +254,24 @@ def bufsize(self) -> int:
         """
         Buffer size in bytes.
         """
-        return self._x.size * self._x.dtype.itemsize
+        return self._x.nbytes
+        # return self._x.size * self._x.dtype.itemsize
 
     @property
     def ptr(self) -> int:
         """
         Pointer to start of the buffer as an integer.
         """
-        return self._x.__cuda_array_interface__['data'][0]
-
+        return self._x.ptr
+        # return self._x.__cuda_array_interface__['data'][0]
+        
     def __dlpack__(self):
         """
         DLPack not implemented in NumPy yet, so leave it out here.
         """
         try: 
-            res = self._x.toDlpack()
+            # res = self._x.toDlpack()
+            res = cp.asarray(self._x).toDlpack()
         except ValueError:
             raise TypeError(f'dtype {self._x.dtype} unsupported by `dlpack`')
 
@@ -265,7 +284,7 @@ def __dlpack_device__(self) -> Tuple[enum.IntEnum, int]:
         class Device(enum.IntEnum):
              CUDA = 2
 
-        return (Device.CUDA, self._x.device.id)
+        return (Device.CUDA, cp.asarray(self._x).device.id)
 
     def __repr__(self) -> str:
         return 'CuDFBuffer(' + str({'bufsize': self.bufsize,
@@ -538,19 +557,22 @@ def _get_data_buffer(self) -> Tuple[_CuDFBuffer, Any]:  # Any is for self.dtype
         invalid = self.describe_null[1]
         if self.dtype[0] in (_k.INT, _k.UINT, _k.FLOAT):
             buffer = _CuDFBuffer(
-                cp.asarray(self._col.fillna(invalid).to_gpu_array()),
+                self._col.data,
+                # cp.asarray(self._col.fillna(invalid).to_gpu_array()),
                 allow_copy=self._allow_copy)
             dtype = self.dtype
         elif self.dtype[0] == _k.BOOL:
             # convert bool to uint8 as dlpack does not support bool natively.
             buffer = _CuDFBuffer(
-                cp.asarray(self._col.fillna(invalid).to_gpu_array(), dtype=cp.uint8),
+                self._col.data,
+                # cp.asarray(self._col.fillna(invalid).to_gpu_array(), dtype=cp.uint8),
                 allow_copy=self._allow_copy)
             dtype = self.dtype
         elif self.dtype[0] == _k.CATEGORICAL:
             codes = self._col.codes
             buffer = _CuDFBuffer(
-                cp.asarray(codes.fillna(invalid)),
+                self._col.codes.data,
+                # cp.asarray(codes.fillna(invalid)),
                 allow_copy=self._allow_copy)
             dtype = self._dtype_from_cudfdtype(codes.dtype)
         # elif self.dtype[0] == _k.STRING:
@@ -584,8 +606,12 @@ def _get_validity_buffer(self) -> Tuple[_CuDFBuffer, Any]:
         null, invalid = self.describe_null
         if null == 3:
             _k = _DtypeKind
-            bitmask = cp.asarray(self._col._get_mask_as_column().to_gpu_array(), dtype=cp.uint8)
-            buffer = _CuDFBuffer(bitmask)
+            # bitmask = cp.asarray(self._col._get_mask_as_column().to_gpu_array(), dtype=cp.uint8)
+            # buffer = _CuDFBuffer(bitmask)
+            if self.dtype[0] == _k.CATEGORICAL:
+                buffer = _CuDFBuffer(self._col.codes._get_mask_as_column().data)
+            else:
+                buffer = _CuDFBuffer(self._col._get_mask_as_column().data)
             dtype = (_k.UINT, 8, "C", "=")
             return buffer, dtype
 

From a3e65e47906d1f5897741b013f05b5b482b73c9b Mon Sep 17 00:00:00 2001
From: iskode <ismael.from.kone@gmail.com>
Date: Thu, 7 Oct 2021 10:44:29 +0000
Subject: [PATCH 16/60] use buffer protocol instead of dlpack protocol as the
 latter doesn't work now

---
 python/cudf/cudf/core/df_protocol.py | 56 ++++++++--------------------
 1 file changed, 15 insertions(+), 41 deletions(-)

diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py
index bc1f905f13f..4f13f5c7738 100644
--- a/python/cudf/cudf/core/df_protocol.py
+++ b/python/cudf/cudf/core/df_protocol.py
@@ -98,22 +98,11 @@ def convert_column_to_cupy_ndarray(col:ColumnObject, allow_copy:bool = False) ->
         raise NotImplementedError("column.offset > 0 not handled yet")
 
     _dbuffer, _ddtype = col.get_buffers()['data']
-    dcol = build_column(Buffer(_dbuffer.ptr, _dbuffer.bufsize), protocol_dtype_to_np_dtype(_ddtype))
-    null_kind, null_value = col.describe_null
-    if null_kind != 0:
-        _vbuffer, _vdtype = col.get_buffers()['validity']
-        valid_mask = cp.asarray(Buffer(_vbuffer.ptr, _vbuffer.bufsize), cp.bool8)
-        dcol[~valid_mask] = None
-        
-    return dcol, _dbuffer
-                #  Buffer(_vbuffer.ptr, _vbuffer.bufsize)if _vbuffer != None else None)
-    # x = buffer_to_cupy_ndarray(_buffer, _dtype, allow_copy=allow_copy)
-    # x = buffer_to_cupy_ndarray(_buffer, _dtype, allow_copy=allow_copy)
+    dcol = build_column(Buffer(_dbuffer.ptr, _dbuffer.bufsize), protocol_dtype_to_np_dtype(_ddtype))        
+    return set_missing_values(col, dcol), _dbuffer
 
-    # return set_missing_values(col, x), _buffer
 
-
-def buffer_to_cupy_ndarray(_buffer, _dtype, allow_copy : bool = False) -> cp.ndarray:
+def buffer_to_cupy_ndarray(_buffer, _dtype, allow_copy : bool = True) -> cp.ndarray:
     if _buffer.__dlpack_device__()[0] == 2: # dataframe is on GPU/CUDA
         x = _gpu_buffer_to_cupy(_buffer, _dtype)
     else:
@@ -124,16 +113,15 @@ def buffer_to_cupy_ndarray(_buffer, _dtype, allow_copy : bool = False) -> cp.nda
 
     return x
 
-def set_missing_values(col, col_array):
-    series = cudf.Series(col_array)
+def set_missing_values(col, dcol):
     null_kind, null_value = col.describe_null
     if  null_kind != 0:
         assert null_kind == 3, f"cudf supports only bit mask, null_kind should be 3 ." 
         _mask_buffer, _mask_dtype = col.get_buffers()["validity"]
-        bitmask = buffer_to_cupy_ndarray(_mask_buffer, _mask_dtype)
-        series[bitmask==null_value] = None
+        bitmask = cp.asarray(Buffer(_mask_buffer.ptr, _mask_buffer.bufsize), cp.bool8) 
+        dcol[~bitmask] = None
 
-    return series
+    return dcol
 
 def _gpu_buffer_to_cupy(_buffer, _dtype):
     _k = _DtypeKind
@@ -146,7 +134,6 @@ def _gpu_buffer_to_cupy(_buffer, _dtype):
     return x
 
 def protocol_dtype_to_np_dtype(_dtype):
-    print(_dtype)
     kind = _dtype[0]
     bitwidth = _dtype[1]
     _k = _DtypeKind
@@ -185,19 +172,15 @@ def convert_categorical_column(col : ColumnObject, allow_copy:bool=False) :
     if not is_dict:
         raise NotImplementedError('Non-dictionary categoricals not supported yet')
 
-    # If you want to cheat for testing (can't use `_col` in real-world code):
-    #    categories = col._col.values.categories.values
-    #    codes = col._col.values.codes
-    categories = cp.asarray(list(mapping.values()))
+    categories = as_column(mapping.values())
     codes_buffer, codes_dtype = col.get_buffers()['data']
-    codes = buffer_to_cupy_ndarray(codes_buffer, codes_dtype, 
-                                   allow_copy=allow_copy)
-    values = categories[codes]
+    cdtype = protocol_dtype_to_np_dtype(codes_dtype)
+    codes = build_column(Buffer(codes_buffer.ptr, codes_buffer.bufsize), cdtype)
+    
+    col1 = build_categorical_column(categories=categories,codes=codes,mask=codes.base_mask,
+                                    size=codes.size,ordered=ordered)
 
-    # Seems like cudf can only construct with non-null values, so need to
-    # null out the nulls later
-    cat = cudf.CategoricalIndex(values, categories=categories, ordered=ordered)
-    return set_missing_values(col, cat), codes_buffer
+    return set_missing_values(col, col1), codes_buffer
 
 
 def __dataframe__(self, nan_as_null : bool = False,
@@ -236,16 +219,7 @@ def __init__(self, x : Buffer, allow_copy : bool = True) -> None:
         """
         Use cudf Buffer object.
         """
-        # if not x.strides == (x.dtype.itemsize,):
-        #     # The protocol does not support strided buffers, so a copy is
-        #     # necessary. If that's not allowed, we need to raise an exception.
-        #     if allow_copy:
-        #         x = x.copy()
-        #     else:
-        #         raise RuntimeError("Exports cannot be zero-copy in the case "
-        #                            "of a non-contiguous buffer")
-
-        # Store the numpy array in which the data resides as a private
+        # Store the cudf buffer where the data resides as a private
         # attribute, so we can use it to retrieve the public attributes
         self._x = x
 

From ae5fb811b133364e59ccf20e8e73e6b73c3ccf49 Mon Sep 17 00:00:00 2001
From: iskode <ismael.from.kone@gmail.com>
Date: Fri, 8 Oct 2021 03:43:44 +0000
Subject: [PATCH 17/60] string support of the protocol

---
 python/cudf/cudf/core/df_protocol.py       | 156 +++++++++++----------
 python/cudf/cudf/tests/test_df_protocol.py |  26 +++-
 2 files changed, 101 insertions(+), 81 deletions(-)

diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py
index 4f13f5c7738..1255b056108 100644
--- a/python/cudf/cudf/core/df_protocol.py
+++ b/python/cudf/cudf/core/df_protocol.py
@@ -64,11 +64,16 @@ def _from_dataframe(df : DataFrameObject) :
     _buffers = []  # hold on to buffers, keeps memory alive
     for name in df.column_names():
         col = df.get_column_by_name(name)
+
         if col.dtype[0] in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL):
-            # Simple numerical or bool dtype, turn into numpy array
-            columns[name], _buf = convert_column_to_cupy_ndarray(col, allow_copy=col._allow_copy)
+            columns[name], _buf = convert_to_cudf_column(col)
+
         elif col.dtype[0] == _k.CATEGORICAL:
-            columns[name], _buf = convert_categorical_column(col, allow_copy=col._allow_copy)
+            columns[name], _buf = convert_to_cudf_categorical(col)
+
+        elif col.dtype[0] == _k.STRING:
+            columns[name], _buf = convert_to_cudf_string(col)
+            
         else:
             raise NotImplementedError(f"Data type {col.dtype[0]} not handled yet")
         
@@ -90,7 +95,7 @@ class _DtypeKind(enum.IntEnum):
     CATEGORICAL = 23
 
 
-def convert_column_to_cupy_ndarray(col:ColumnObject, allow_copy:bool = False) -> cp.ndarray:
+def convert_to_cudf_column(col:ColumnObject) -> cp.ndarray:
     """
     Convert an int, uint, float or bool column to a numpy array
     """
@@ -98,15 +103,22 @@ def convert_column_to_cupy_ndarray(col:ColumnObject, allow_copy:bool = False) ->
         raise NotImplementedError("column.offset > 0 not handled yet")
 
     _dbuffer, _ddtype = col.get_buffers()['data']
-    dcol = build_column(Buffer(_dbuffer.ptr, _dbuffer.bufsize), protocol_dtype_to_np_dtype(_ddtype))        
+    check_data_is_on_gpu(_dbuffer)
+    dcol = build_column(Buffer(_dbuffer.ptr, _dbuffer.bufsize), 
+                        protocol_dtypes_to_cupy_dtype(_ddtype))        
     return set_missing_values(col, dcol), _dbuffer
 
+def check_data_is_on_gpu(buffer):
+  
+    if buffer.__dlpack_device__()[0] != 2 and not buffer._allow_copy:
+        raise TypeError("This operation must copy data from CPU to GPU."
+                            "Set `allow_copy=True` to allow it.")
 
-def buffer_to_cupy_ndarray(_buffer, _dtype, allow_copy : bool = True) -> cp.ndarray:
+def buffer_to_cupy_ndarray(_buffer, _dtype) -> cp.ndarray:
     if _buffer.__dlpack_device__()[0] == 2: # dataframe is on GPU/CUDA
         x = _gpu_buffer_to_cupy(_buffer, _dtype)
     else:
-        if not allow_copy:
+        if not _buffer._allow_copy:
             raise TypeError("This operation must copy data from CPU to GPU."
                             "Set `allow_copy=True` to allow it.")
         x = _cpu_buffer_to_cupy(_buffer, _dtype)
@@ -133,7 +145,7 @@ def _gpu_buffer_to_cupy(_buffer, _dtype):
         raise NotImplementedError(f"Data type {_dtype[0]} not handled yet")
     return x
 
-def protocol_dtype_to_np_dtype(_dtype):
+def protocol_dtypes_to_cupy_dtype(_dtype):
     kind = _dtype[0]
     bitwidth = _dtype[1]
     _k = _DtypeKind
@@ -141,16 +153,16 @@ def protocol_dtype_to_np_dtype(_dtype):
                          _k.STRING, _k.DATETIME):
         raise RuntimeError(f"Data type {_dtype[0]} not handled yet")
 
-    _ints = {8: np.int8, 16: np.int16, 32: np.int32, 64: np.int64}
-    _uints = {8: np.uint8, 16: np.uint16, 32: np.uint32, 64: np.uint64}
-    _floats = {32: np.float32, 64: np.float64}
-    _np_dtypes = {0: _ints, 1: _uints, 2: _floats, 20: {8: bool}}
-    return _np_dtypes[kind][bitwidth]
+    _ints = {8: cp.int8, 16: cp.int16, 32: cp.int32, 64: cp.int64}
+    _uints = {8: cp.uint8, 16: cp.uint16, 32: cp.uint32, 64: cp.uint64}
+    _floats = {32: cp.float32, 64: cp.float64}
+    _cp_dtypes = {0: _ints, 1: _uints, 2: _floats, 20: {8: bool}}
+    return _cp_dtypes[kind][bitwidth]
 
 def _cpu_buffer_to_cupy(_buffer, _dtype):
     # Handle the dtype
    
-    column_dtype = protocol_dtype_to_np_dtype(_dtype)
+    column_dtype = protocol_dtypes_to_cupy_dtype(_dtype)
     # No DLPack yet, so need to construct a new ndarray from the data pointer
     # and size in the buffer plus the dtype on the column
     ctypes_type = np.ctypeslib.as_ctypes_type(column_dtype)
@@ -164,7 +176,7 @@ def _cpu_buffer_to_cupy(_buffer, _dtype):
     return cp.asarray(x, dtype=column_dtype)
 
 
-def convert_categorical_column(col : ColumnObject, allow_copy:bool=False) :
+def convert_to_cudf_categorical(col : ColumnObject) :
     """
     Convert a categorical column to a Series instance
     """
@@ -174,7 +186,8 @@ def convert_categorical_column(col : ColumnObject, allow_copy:bool=False) :
 
     categories = as_column(mapping.values())
     codes_buffer, codes_dtype = col.get_buffers()['data']
-    cdtype = protocol_dtype_to_np_dtype(codes_dtype)
+    check_data_is_on_gpu(codes_buffer)
+    cdtype = protocol_dtypes_to_cupy_dtype(codes_dtype)
     codes = build_column(Buffer(codes_buffer.ptr, codes_buffer.bufsize), cdtype)
     
     col1 = build_categorical_column(categories=categories,codes=codes,mask=codes.base_mask,
@@ -183,6 +196,32 @@ def convert_categorical_column(col : ColumnObject, allow_copy:bool=False) :
     return set_missing_values(col, col1), codes_buffer
 
 
+def convert_to_cudf_string(col : ColumnObject) :
+    """
+    Convert a string ColumnObject to cudf Column object.
+    """
+    # Retrieve the data buffers
+    buffers = col.get_buffers()
+
+    # Retrieve the data buffer containing the UTF-8 code units
+    dbuffer, bdtype = buffers["data"]
+    check_data_is_on_gpu(dbuffer)
+    encoded_string = build_column(Buffer(dbuffer.ptr, dbuffer.bufsize),
+                        protocol_dtypes_to_cupy_dtype(bdtype)
+                        )
+
+    # Retrieve the offsets buffer containing the index offsets demarcating the beginning and end of each string
+    obuffer, odtype = buffers["offsets"]
+    offsets = build_column(Buffer(obuffer.ptr, obuffer.bufsize), 
+                           protocol_dtypes_to_cupy_dtype(odtype)
+                           )
+    
+    col_str = build_column(None, dtype=cp.dtype('O'), children=(offsets, encoded_string))
+
+    return set_missing_values(col, col_str), buffers
+
+
+
 def __dataframe__(self, nan_as_null : bool = False,
                   allow_copy : bool = True) -> dict:
     """
@@ -222,6 +261,7 @@ def __init__(self, x : Buffer, allow_copy : bool = True) -> None:
         # Store the cudf buffer where the data resides as a private
         # attribute, so we can use it to retrieve the public attributes
         self._x = x
+        self._allow_copy = allow_copy
 
     @property
     def bufsize(self) -> int:
@@ -284,7 +324,7 @@ class _CuDFColumn:
 
     def __init__(self, column,
                  nan_as_null : bool = True, 
-                 allow_copy: bool = False) -> None:
+                 allow_copy: bool = True) -> None:
         """
         Note: doesn't deal with extension arrays yet, just assume a regular
         Series/ndarray for now.
@@ -529,41 +569,21 @@ def _get_data_buffer(self) -> Tuple[_CuDFBuffer, Any]:  # Any is for self.dtype
         """
         _k = _DtypeKind
         invalid = self.describe_null[1]
-        if self.dtype[0] in (_k.INT, _k.UINT, _k.FLOAT):
-            buffer = _CuDFBuffer(
-                self._col.data,
-                # cp.asarray(self._col.fillna(invalid).to_gpu_array()),
-                allow_copy=self._allow_copy)
-            dtype = self.dtype
-        elif self.dtype[0] == _k.BOOL:
-            # convert bool to uint8 as dlpack does not support bool natively.
-            buffer = _CuDFBuffer(
-                self._col.data,
-                # cp.asarray(self._col.fillna(invalid).to_gpu_array(), dtype=cp.uint8),
-                allow_copy=self._allow_copy)
+        if self.dtype[0] in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL):
+            buffer = _CuDFBuffer(self._col.data, allow_copy=self._allow_copy)
             dtype = self.dtype
+
         elif self.dtype[0] == _k.CATEGORICAL:
             codes = self._col.codes
-            buffer = _CuDFBuffer(
-                self._col.codes.data,
-                # cp.asarray(codes.fillna(invalid)),
-                allow_copy=self._allow_copy)
+            buffer = _CuDFBuffer(self._col.codes.data, allow_copy=self._allow_copy)
             dtype = self._dtype_from_cudfdtype(codes.dtype)
-        # elif self.dtype[0] == _k.STRING:
-        #     # Marshal the strings from a NumPy object array into a byte array
-        #     buf = self._col.to_numpy()
-        #     b = bytearray()
-
-        #     # TODO: this for-loop is slow; can be implemented in Cython/C/C++ later
-        #     for i in range(buf.size):
-        #         if type(buf[i]) == str:
-        #             b.extend(buf[i].encode(encoding="utf-8"))
 
-        #     # Convert the byte array to a Pandas "buffer" using a NumPy array as the backing store
-        #     buffer = _CuDFBuffer(np.frombuffer(b, dtype="uint8"))
+        elif self.dtype[0] == _k.STRING:
+            encoded_string = self._col.children[1]
+            buffer = _CuDFBuffer(encoded_string.data, allow_copy=self._allow_copy)
+            dtype = self._dtype_from_cudfdtype(encoded_string.dtype) 
+            # dtype = (_k.STRING, 8, "u", "=") 
 
-        #     # Define the dtype for the returned buffer
-        #     dtype = (_k.STRING, 8, "u", "=")  # note: currently only support native endianness
         else:
             raise NotImplementedError(f"Data type {self._col.dtype} not handled yet")
 
@@ -580,12 +600,12 @@ def _get_validity_buffer(self) -> Tuple[_CuDFBuffer, Any]:
         null, invalid = self.describe_null
         if null == 3:
             _k = _DtypeKind
-            # bitmask = cp.asarray(self._col._get_mask_as_column().to_gpu_array(), dtype=cp.uint8)
-            # buffer = _CuDFBuffer(bitmask)
             if self.dtype[0] == _k.CATEGORICAL:
-                buffer = _CuDFBuffer(self._col.codes._get_mask_as_column().data)
+                buffer = _CuDFBuffer(self._col.codes._get_mask_as_column().data, 
+                                     allow_copy=self._allow_copy)
             else:
-                buffer = _CuDFBuffer(self._col._get_mask_as_column().data)
+                buffer = _CuDFBuffer(self._col._get_mask_as_column().data, 
+                                     allow_copy=self._allow_copy)
             dtype = (_k.UINT, 8, "C", "=")
             return buffer, dtype
 
@@ -606,33 +626,15 @@ def _get_offsets_buffer(self) -> Tuple[_CuDFBuffer, Any]:
         Raises RuntimeError if the data buffer does not have an associated
         offsets buffer.
         """
-        # _k = _DtypeKind
-        # if self.dtype[0] == _k.STRING:
-        #     # For each string, we need to manually determine the next offset
-        #     values = self._col.to_numpy()
-        #     ptr = 0
-        #     offsets = [ptr]
-        #     for v in values:
-        #         # For missing values (in this case, `np.nan` values), we don't increment the pointer)
-        #         if type(v) == str:
-        #             b = v.encode(encoding="utf-8")
-        #             ptr += len(b)
-
-        #         offsets.append(ptr)
-
-        #     # Convert the list of offsets to a NumPy array of signed 64-bit integers (note: Arrow allows the offsets array to be either `int32` or `int64`; here, we default to the latter)
-        #     buf = cp.asarray(offsets, dtype="int64")
-
-        #     # Convert the offsets to a Pandas "buffer" using the NumPy array as the backing store
-        #     buffer = _CuDFBuffer(buf)
-
-        #     # Assemble the buffer dtype info
-        #     dtype = (_k.INT, 64, 'l', "=")  # note: currently only support native endianness
-        # else:
-        #     raise RuntimeError("This column has a fixed-length dtype so does not have an offsets buffer")
-
-        # return buffer, dtype
-        pass
+        _k = _DtypeKind
+        if self.dtype[0] == _k.STRING:
+            offsets = self._col.children[0]
+            buffer = _CuDFBuffer(offsets.data, allow_copy=self._allow_copy)
+            dtype = self._dtype_from_cudfdtype(offsets.dtype) 
+        else:
+            raise RuntimeError("This column has a fixed-length dtype so does not have an offsets buffer")
+
+        return buffer, dtype
 
 class _CuDFDataFrame:
     """
diff --git a/python/cudf/cudf/tests/test_df_protocol.py b/python/cudf/cudf/tests/test_df_protocol.py
index f89ddeeb0e3..7936aa46ac5 100644
--- a/python/cudf/cudf/tests/test_df_protocol.py
+++ b/python/cudf/cudf/tests/test_df_protocol.py
@@ -115,6 +115,13 @@ def test_bool_dtype():
     data_bool = dict(a=[True, True, False], b=[False, True, False])
     _test_datatype(data_bool)
 
+
+def test_string_dtype():
+    data_string = dict(a=["a", "b", "cdef", "", "g"])
+    _test_datatype(data_string)
+   
+
+
 def test_mixed_dtype():
     data_mixed = dict(int=[1, 2, 3], float=[1.5, 2.5, 3.5],
                         bool=[True, False, True], categorical=[5, 1, 5])
@@ -148,9 +155,20 @@ def test_NA_categorical_dtype():
     _test_from_dataframe_equals(df.__dataframe__(allow_copy=False))
     _test_from_dataframe_equals(df.__dataframe__(allow_copy=True))
 
-
-
-
 def test_NA_bool_dtype():
     data_bool = dict(a=[None, True, False], b=[False, None, None])
-    _test_datatype(data_bool)
\ No newline at end of file
+    _test_datatype(data_bool)
+
+def test_NA_string_dtype():
+    df = cudf.DataFrame({"A": ["a", "b", "cdef", "", "g"]})
+    df["B"] = df["A"].astype("object")
+    df.at[1, "B"] = cudf.NA  # Set one item to null
+
+    # Test for correctness and null handling:
+    col = df.__dataframe__().get_column_by_name("B")
+    assert col.dtype[0] == _DtypeKind.STRING
+    assert col.null_count == 1
+    assert col.describe_null == (3, 0)
+    assert col.num_chunks() == 1
+    _test_from_dataframe_equals(df.__dataframe__(allow_copy=False))
+    _test_from_dataframe_equals(df.__dataframe__(allow_copy=True))

From ed7130b8c044373d148d75cd9b3291a6e050ae29 Mon Sep 17 00:00:00 2001
From: iskode <ismael.from.kone@gmail.com>
Date: Thu, 14 Oct 2021 17:48:16 +0000
Subject: [PATCH 18/60] rename '_CuDFBuffer' attribute '_x' to '_buf'

---
 python/cudf/cudf/core/df_protocol.py | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py
index 1255b056108..376d68e3c95 100644
--- a/python/cudf/cudf/core/df_protocol.py
+++ b/python/cudf/cudf/core/df_protocol.py
@@ -254,13 +254,13 @@ class _CuDFBuffer:
     Data in the buffer is guaranteed to be contiguous in memory.
     """
 
-    def __init__(self, x : Buffer, allow_copy : bool = True) -> None:
+    def __init__(self, buf : Buffer, allow_copy : bool = True) -> None:
         """
         Use cudf Buffer object.
         """
         # Store the cudf buffer where the data resides as a private
         # attribute, so we can use it to retrieve the public attributes
-        self._x = x
+        self._buf = buf
         self._allow_copy = allow_copy
 
     @property
@@ -268,15 +268,14 @@ def bufsize(self) -> int:
         """
         Buffer size in bytes.
         """
-        return self._x.nbytes
-        # return self._x.size * self._x.dtype.itemsize
+        return self._buf.nbytes
 
     @property
     def ptr(self) -> int:
         """
         Pointer to start of the buffer as an integer.
         """
-        return self._x.ptr
+        return self._buf.ptr
         # return self._x.__cuda_array_interface__['data'][0]
         
     def __dlpack__(self):
@@ -285,9 +284,9 @@ def __dlpack__(self):
         """
         try: 
             # res = self._x.toDlpack()
-            res = cp.asarray(self._x).toDlpack()
+            res = cp.asarray(self._buf).toDlpack()
         except ValueError:
-            raise TypeError(f'dtype {self._x.dtype} unsupported by `dlpack`')
+            raise TypeError(f'dtype {self._buf.dtype} unsupported by `dlpack`')
 
         return res
 
@@ -298,7 +297,7 @@ def __dlpack_device__(self) -> Tuple[enum.IntEnum, int]:
         class Device(enum.IntEnum):
              CUDA = 2
 
-        return (Device.CUDA, cp.asarray(self._x).device.id)
+        return (Device.CUDA, cp.asarray(self._buf).device.id)
 
     def __repr__(self) -> str:
         return 'CuDFBuffer(' + str({'bufsize': self.bufsize,

From 9a6f957f2a2143af144fa04e0347593afd9bc6a1 Mon Sep 17 00:00:00 2001
From: iskode <ismael.from.kone@gmail.com>
Date: Fri, 15 Oct 2021 07:32:48 +0000
Subject: [PATCH 19/60] add thorough tests for the '_CuDFColumn' and
 '_CuDFBuffer' classes

---
 python/cudf/cudf/tests/test_df_protocol.py | 44 ++++++++++++++++++----
 1 file changed, 36 insertions(+), 8 deletions(-)

diff --git a/python/cudf/cudf/tests/test_df_protocol.py b/python/cudf/cudf/tests/test_df_protocol.py
index 7936aa46ac5..413f02b8afd 100644
--- a/python/cudf/cudf/tests/test_df_protocol.py
+++ b/python/cudf/cudf/tests/test_df_protocol.py
@@ -1,17 +1,21 @@
 import datetime
-import cupy
+import cupy as cp
 import numpy as np
 import pytest
 from cudf.core.df_protocol import (
     _from_dataframe, 
     _DtypeKind,
-    __dataframe__,
+    protocol_dtypes_to_cupy_dtype,
+
+
     _CuDFDataFrame,
     _CuDFColumn,
     _CuDFBuffer
 )
 
 import cudf
+from cudf.core.column import build_column
+from cudf.core.buffer import Buffer
 from cudf.testing import _utils as utils
 from cudf.testing._utils import (
     ALL_TYPES,
@@ -27,11 +31,20 @@
 
 DataFrameObject = Any
 
-def assert_buffer_equal(buffer_dtype: Tuple[_CuDFBuffer, Any], cudfcol:cudf.Series):
+def assert_buffer_equal(buffer_dtype: Tuple[_CuDFBuffer, Any], cudfcol):
     buf, dtype = buffer_dtype
-    assert buf.__dlpack_device__() == (2, 0)
-
-def assert_column_equal(col: _CuDFColumn, cudfcol:cudf.Series):
+    device_id = cp.asarray(cudfcol.data).device.id
+    assert buf.__dlpack_device__() == (2, device_id)
+    col_from_buf = build_column(Buffer(buf.ptr, buf.bufsize),
+                        protocol_dtypes_to_cupy_dtype(dtype)
+                        )
+    # check that non null values are the equals as null are represented
+    # by sentinel values in the buffer.
+    non_null_idxs = cudfcol!=None
+    assert_eq(col_from_buf[non_null_idxs], cudfcol[non_null_idxs])
+
+
+def assert_column_equal(col: _CuDFColumn, cudfcol):
     assert col.size == cudfcol.size 
     assert col.offset == 0
     assert col.null_count == cudfcol.isna().sum() 
@@ -39,7 +52,22 @@ def assert_column_equal(col: _CuDFColumn, cudfcol:cudf.Series):
     if col.null_count == 0 :
         pytest.raises(RuntimeError, col._get_validity_buffer)
         assert col.get_buffers()['validity'] == None
-    assert_buffer_equal(col._get_data_buffer(), cudfcol)
+    else:
+        assert_buffer_equal(col.get_buffers()['validity'],
+                            cudfcol._get_mask_as_column().astype(cp.uint8))
+    
+    if col.dtype[0] == _DtypeKind.CATEGORICAL:
+        assert_buffer_equal(col.get_buffers()['data'], cudfcol.codes)
+        assert col.get_buffers()['offsets'] == None
+
+    elif col.dtype[0] == _DtypeKind.STRING:
+        assert_buffer_equal(col.get_buffers()['data'], cudfcol.children[1])
+        assert_buffer_equal(col.get_buffers()['offsets'], cudfcol.children[0])
+        
+    else:
+        assert_buffer_equal(col.get_buffers()['data'], cudfcol)
+        assert col.get_buffers()['offsets'] == None
+
     null_kind, null_value = col.describe_null
     if col.null_count == 0:
         assert null_kind == 0
@@ -55,7 +83,7 @@ def assert_dataframe_equal(dfo: DataFrameObject, df:cudf.DataFrame):
     assert dfo.num_chunks() == 1
     assert dfo.column_names() == list(df.columns)
     for col in df.columns:
-        assert_column_equal(dfo.get_column_by_name(col), df[col])
+        assert_column_equal(dfo.get_column_by_name(col), df[col]._column)
 
 
 def _test_from_dataframe_equals(dfobj):

From f9ca94d86e7b93e1457abc785faadc1b560c1b9e Mon Sep 17 00:00:00 2001
From: iskode <ismael.from.kone@gmail.com>
Date: Fri, 15 Oct 2021 09:01:46 +0000
Subject: [PATCH 20/60] write 'dlpack' support tests

---
 python/cudf/cudf/tests/test_df_protocol.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/python/cudf/cudf/tests/test_df_protocol.py b/python/cudf/cudf/tests/test_df_protocol.py
index 413f02b8afd..d5f12d76085 100644
--- a/python/cudf/cudf/tests/test_df_protocol.py
+++ b/python/cudf/cudf/tests/test_df_protocol.py
@@ -42,6 +42,12 @@ def assert_buffer_equal(buffer_dtype: Tuple[_CuDFBuffer, Any], cudfcol):
     # by sentinel values in the buffer.
     non_null_idxs = cudfcol!=None
     assert_eq(col_from_buf[non_null_idxs], cudfcol[non_null_idxs])
+    array_from_dlpack = cp.fromDlpack(buf.__dlpack__())
+    col_array = cp.asarray(cudfcol.data_array_view)
+    # non_null_idxs = (col_array!=None)
+    assert_eq(array_from_dlpack.all(), col_array.all())
+    print(f"dlpack OK: \n{array_from_dlpack}\n{col_array}")
+
 
 
 def assert_column_equal(col: _CuDFColumn, cudfcol):

From 1709babdd0f6d5d79110d3c3668f3cb772fff26b Mon Sep 17 00:00:00 2001
From: iskode <ismael.from.kone@gmail.com>
Date: Fri, 15 Oct 2021 09:23:50 +0000
Subject: [PATCH 21/60] dlpack support ok

---
 python/cudf/cudf/core/df_protocol.py       | 23 +++++++++++++---------
 python/cudf/cudf/tests/test_df_protocol.py | 14 ++++++++-----
 2 files changed, 23 insertions(+), 14 deletions(-)

diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py
index 376d68e3c95..de25bea4563 100644
--- a/python/cudf/cudf/core/df_protocol.py
+++ b/python/cudf/cudf/core/df_protocol.py
@@ -28,6 +28,7 @@
 from cudf.core.buffer import Buffer
 import numpy as np
 import cupy as cp
+from numba import cuda
 
 
 # A typing protocol could be added later to let Mypy validate code using
@@ -254,13 +255,14 @@ class _CuDFBuffer:
     Data in the buffer is guaranteed to be contiguous in memory.
     """
 
-    def __init__(self, buf : Buffer, allow_copy : bool = True) -> None:
+    def __init__(self, buf : Buffer, cudf_dtype, allow_copy : bool = True) -> None:
         """
         Use cudf Buffer object.
         """
         # Store the cudf buffer where the data resides as a private
         # attribute, so we can use it to retrieve the public attributes
         self._buf = buf
+        self._cudf_dtype =  cudf_dtype
         self._allow_copy = allow_copy
 
     @property
@@ -284,9 +286,10 @@ def __dlpack__(self):
         """
         try: 
             # res = self._x.toDlpack()
-            res = cp.asarray(self._buf).toDlpack()
+            cudarray = cuda.as_cuda_array(self._buf).view(self._cudf_dtype)
+            res = cp.asarray(cudarray).toDlpack()
         except ValueError:
-            raise TypeError(f'dtype {self._buf.dtype} unsupported by `dlpack`')
+            raise TypeError(f'dtype {self._cudf_dtype} unsupported by `dlpack`')
 
         return res
 
@@ -569,17 +572,19 @@ def _get_data_buffer(self) -> Tuple[_CuDFBuffer, Any]:  # Any is for self.dtype
         _k = _DtypeKind
         invalid = self.describe_null[1]
         if self.dtype[0] in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL):
-            buffer = _CuDFBuffer(self._col.data, allow_copy=self._allow_copy)
+            buffer = _CuDFBuffer(self._col.data, self._col.dtype, 
+                                 allow_copy=self._allow_copy)
             dtype = self.dtype
 
         elif self.dtype[0] == _k.CATEGORICAL:
             codes = self._col.codes
-            buffer = _CuDFBuffer(self._col.codes.data, allow_copy=self._allow_copy)
+            buffer = _CuDFBuffer(self._col.codes.data, self._col.codes.dtype, 
+                                 allow_copy=self._allow_copy)
             dtype = self._dtype_from_cudfdtype(codes.dtype)
 
         elif self.dtype[0] == _k.STRING:
             encoded_string = self._col.children[1]
-            buffer = _CuDFBuffer(encoded_string.data, allow_copy=self._allow_copy)
+            buffer = _CuDFBuffer(encoded_string.data, encoded_string.dtype, allow_copy=self._allow_copy)
             dtype = self._dtype_from_cudfdtype(encoded_string.dtype) 
             # dtype = (_k.STRING, 8, "u", "=") 
 
@@ -600,10 +605,10 @@ def _get_validity_buffer(self) -> Tuple[_CuDFBuffer, Any]:
         if null == 3:
             _k = _DtypeKind
             if self.dtype[0] == _k.CATEGORICAL:
-                buffer = _CuDFBuffer(self._col.codes._get_mask_as_column().data, 
+                buffer = _CuDFBuffer(self._col.codes._get_mask_as_column().data, cp.uint8, 
                                      allow_copy=self._allow_copy)
             else:
-                buffer = _CuDFBuffer(self._col._get_mask_as_column().data, 
+                buffer = _CuDFBuffer(self._col._get_mask_as_column().data, cp.uint8,
                                      allow_copy=self._allow_copy)
             dtype = (_k.UINT, 8, "C", "=")
             return buffer, dtype
@@ -628,7 +633,7 @@ def _get_offsets_buffer(self) -> Tuple[_CuDFBuffer, Any]:
         _k = _DtypeKind
         if self.dtype[0] == _k.STRING:
             offsets = self._col.children[0]
-            buffer = _CuDFBuffer(offsets.data, allow_copy=self._allow_copy)
+            buffer = _CuDFBuffer(offsets.data, offsets.dtype, allow_copy=self._allow_copy)
             dtype = self._dtype_from_cudfdtype(offsets.dtype) 
         else:
             raise RuntimeError("This column has a fixed-length dtype so does not have an offsets buffer")
diff --git a/python/cudf/cudf/tests/test_df_protocol.py b/python/cudf/cudf/tests/test_df_protocol.py
index d5f12d76085..523fb0ce7a1 100644
--- a/python/cudf/cudf/tests/test_df_protocol.py
+++ b/python/cudf/cudf/tests/test_df_protocol.py
@@ -42,11 +42,15 @@ def assert_buffer_equal(buffer_dtype: Tuple[_CuDFBuffer, Any], cudfcol):
     # by sentinel values in the buffer.
     non_null_idxs = cudfcol!=None
     assert_eq(col_from_buf[non_null_idxs], cudfcol[non_null_idxs])
-    array_from_dlpack = cp.fromDlpack(buf.__dlpack__())
-    col_array = cp.asarray(cudfcol.data_array_view)
-    # non_null_idxs = (col_array!=None)
-    assert_eq(array_from_dlpack.all(), col_array.all())
-    print(f"dlpack OK: \n{array_from_dlpack}\n{col_array}")
+    
+    if dtype[0] != _DtypeKind.BOOL:
+        array_from_dlpack = cp.fromDlpack(buf.__dlpack__())
+        col_array = cp.asarray(cudfcol.data_array_view)
+        assert_eq(array_from_dlpack.all(), col_array.all())
+        print(f"dlpack OK: \n{array_from_dlpack}\n{col_array}")
+    else:
+        pytest.raises(TypeError, buf.__dlpack__)
+
 
 
 

From 25c4474629d4ae4e147751ed1217d018c68f76f4 Mon Sep 17 00:00:00 2001
From: iskode <ismael.from.kone@gmail.com>
Date: Mon, 18 Oct 2021 08:36:17 +0000
Subject: [PATCH 22/60] use '_from_data' to create dataframe from columns

---
 python/cudf/cudf/core/df_protocol.py       | 4 +---
 python/cudf/cudf/tests/test_df_protocol.py | 1 -
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py
index de25bea4563..905a9f8695c 100644
--- a/python/cudf/cudf/core/df_protocol.py
+++ b/python/cudf/cudf/core/df_protocol.py
@@ -80,7 +80,7 @@ def _from_dataframe(df : DataFrameObject) :
         
         _buffers.append(_buf)
 
-    df_new = cudf.DataFrame(columns)
+    df_new = cudf.DataFrame._from_data(columns)
     df_new._buffers = _buffers
     return df_new
 
@@ -278,14 +278,12 @@ def ptr(self) -> int:
         Pointer to start of the buffer as an integer.
         """
         return self._buf.ptr
-        # return self._x.__cuda_array_interface__['data'][0]
         
     def __dlpack__(self):
         """
         DLPack not implemented in NumPy yet, so leave it out here.
         """
         try: 
-            # res = self._x.toDlpack()
             cudarray = cuda.as_cuda_array(self._buf).view(self._cudf_dtype)
             res = cp.asarray(cudarray).toDlpack()
         except ValueError:
diff --git a/python/cudf/cudf/tests/test_df_protocol.py b/python/cudf/cudf/tests/test_df_protocol.py
index 523fb0ce7a1..2fe1652750d 100644
--- a/python/cudf/cudf/tests/test_df_protocol.py
+++ b/python/cudf/cudf/tests/test_df_protocol.py
@@ -47,7 +47,6 @@ def assert_buffer_equal(buffer_dtype: Tuple[_CuDFBuffer, Any], cudfcol):
         array_from_dlpack = cp.fromDlpack(buf.__dlpack__())
         col_array = cp.asarray(cudfcol.data_array_view)
         assert_eq(array_from_dlpack.all(), col_array.all())
-        print(f"dlpack OK: \n{array_from_dlpack}\n{col_array}")
     else:
         pytest.raises(TypeError, buf.__dlpack__)
 

From 5f441c2767938c214b02f7d01f5ac0a794c3274c Mon Sep 17 00:00:00 2001
From: iskode <ismael.from.kone@gmail.com>
Date: Wed, 20 Oct 2021 18:04:44 +0000
Subject: [PATCH 23/60] harmonize method names like 'convert_to_cudf_string'

---
 python/cudf/cudf/core/df_protocol.py | 38 +++++++++++++---------------
 1 file changed, 18 insertions(+), 20 deletions(-)

diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py
index 905a9f8695c..fb5bf8434fb 100644
--- a/python/cudf/cudf/core/df_protocol.py
+++ b/python/cudf/cudf/core/df_protocol.py
@@ -4,7 +4,7 @@
 Public API
 ----------
 
-from_dataframe : construct a pandas.DataFrame from an input data frame which
+from_dataframe : construct a cudf.DataFrame from an input data frame which
                  implements the exchange protocol
 
 Notes
@@ -58,8 +58,7 @@ def _from_dataframe(df : DataFrameObject) :
     if df.num_chunks() > 1:
         raise NotImplementedError
 
-    # We need a dict of columns here, with each column being a numpy array (at
-    # least for now, deal with non-numpy dtypes later).
+    # We need a dict of columns here, with each column being a cudf column column.
     columns = dict()
     _k = _DtypeKind
     _buffers = []  # hold on to buffers, keeps memory alive
@@ -67,7 +66,7 @@ def _from_dataframe(df : DataFrameObject) :
         col = df.get_column_by_name(name)
 
         if col.dtype[0] in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL):
-            columns[name], _buf = convert_to_cudf_column(col)
+            columns[name], _buf = _protocol_column_to_cudf_column_numeric(col)
 
         elif col.dtype[0] == _k.CATEGORICAL:
             columns[name], _buf = convert_to_cudf_categorical(col)
@@ -95,22 +94,21 @@ class _DtypeKind(enum.IntEnum):
     DATETIME = 22
     CATEGORICAL = 23
 
-
-def convert_to_cudf_column(col:ColumnObject) -> cp.ndarray:
+def _protocol_column_to_cudf_column_numeric(col:ColumnObject):
     """
-    Convert an int, uint, float or bool column to a numpy array
+    Convert an int, uint, float or bool protocol column to the corresponding cudf column
     """
     if col.offset != 0:
         raise NotImplementedError("column.offset > 0 not handled yet")
 
     _dbuffer, _ddtype = col.get_buffers()['data']
-    check_data_is_on_gpu(_dbuffer)
+    _check_data_is_on_gpu(_dbuffer)
     dcol = build_column(Buffer(_dbuffer.ptr, _dbuffer.bufsize), 
                         protocol_dtypes_to_cupy_dtype(_ddtype))        
-    return set_missing_values(col, dcol), _dbuffer
+    return _set_missing_values(col, dcol), _dbuffer
+
 
-def check_data_is_on_gpu(buffer):
-  
+def _check_data_is_on_gpu(buffer):
     if buffer.__dlpack_device__()[0] != 2 and not buffer._allow_copy:
         raise TypeError("This operation must copy data from CPU to GPU."
                             "Set `allow_copy=True` to allow it.")
@@ -126,15 +124,15 @@ def buffer_to_cupy_ndarray(_buffer, _dtype) -> cp.ndarray:
 
     return x
 
-def set_missing_values(col, dcol):
-    null_kind, null_value = col.describe_null
+def _set_missing_values(protocol_col, cudf_col):
+    null_kind, null_value = protocol_col.describe_null
     if  null_kind != 0:
         assert null_kind == 3, f"cudf supports only bit mask, null_kind should be 3 ." 
-        _mask_buffer, _mask_dtype = col.get_buffers()["validity"]
+        _mask_buffer, _mask_dtype = protocol_col.get_buffers()["validity"]
         bitmask = cp.asarray(Buffer(_mask_buffer.ptr, _mask_buffer.bufsize), cp.bool8) 
-        dcol[~bitmask] = None
+        cudf_col[~bitmask] = None
 
-    return dcol
+    return cudf_col
 
 def _gpu_buffer_to_cupy(_buffer, _dtype):
     _k = _DtypeKind
@@ -187,14 +185,14 @@ def convert_to_cudf_categorical(col : ColumnObject) :
 
     categories = as_column(mapping.values())
     codes_buffer, codes_dtype = col.get_buffers()['data']
-    check_data_is_on_gpu(codes_buffer)
+    _check_data_is_on_gpu(codes_buffer)
     cdtype = protocol_dtypes_to_cupy_dtype(codes_dtype)
     codes = build_column(Buffer(codes_buffer.ptr, codes_buffer.bufsize), cdtype)
     
     col1 = build_categorical_column(categories=categories,codes=codes,mask=codes.base_mask,
                                     size=codes.size,ordered=ordered)
 
-    return set_missing_values(col, col1), codes_buffer
+    return _set_missing_values(col, col1), codes_buffer
 
 
 def convert_to_cudf_string(col : ColumnObject) :
@@ -206,7 +204,7 @@ def convert_to_cudf_string(col : ColumnObject) :
 
     # Retrieve the data buffer containing the UTF-8 code units
     dbuffer, bdtype = buffers["data"]
-    check_data_is_on_gpu(dbuffer)
+    _check_data_is_on_gpu(dbuffer)
     encoded_string = build_column(Buffer(dbuffer.ptr, dbuffer.bufsize),
                         protocol_dtypes_to_cupy_dtype(bdtype)
                         )
@@ -219,7 +217,7 @@ def convert_to_cudf_string(col : ColumnObject) :
     
     col_str = build_column(None, dtype=cp.dtype('O'), children=(offsets, encoded_string))
 
-    return set_missing_values(col, col_str), buffers
+    return _set_missing_values(col, col_str), buffers
 
 
 

From 78741a936dc25bff42e1dd21ff39501c4a6ed7eb Mon Sep 17 00:00:00 2001
From: iskode <ismael.from.kone@gmail.com>
Date: Fri, 22 Oct 2021 08:43:58 +0000
Subject: [PATCH 24/60] Do the same for 'convert_to_cudf_categorical'

---
 python/cudf/cudf/core/df_protocol.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py
index fb5bf8434fb..89e67d60e76 100644
--- a/python/cudf/cudf/core/df_protocol.py
+++ b/python/cudf/cudf/core/df_protocol.py
@@ -69,10 +69,10 @@ def _from_dataframe(df : DataFrameObject) :
             columns[name], _buf = _protocol_column_to_cudf_column_numeric(col)
 
         elif col.dtype[0] == _k.CATEGORICAL:
-            columns[name], _buf = convert_to_cudf_categorical(col)
+            columns[name], _buf = _protocol_column_to_cudf_column_categorical(col)
 
         elif col.dtype[0] == _k.STRING:
-            columns[name], _buf = convert_to_cudf_string(col)
+            columns[name], _buf = _protocol_column_to_cudf_column_string(col)
             
         else:
             raise NotImplementedError(f"Data type {col.dtype[0]} not handled yet")
@@ -175,7 +175,7 @@ def _cpu_buffer_to_cupy(_buffer, _dtype):
     return cp.asarray(x, dtype=column_dtype)
 
 
-def convert_to_cudf_categorical(col : ColumnObject) :
+def _protocol_column_to_cudf_column_categorical(col : ColumnObject) :
     """
     Convert a categorical column to a Series instance
     """
@@ -195,7 +195,7 @@ def convert_to_cudf_categorical(col : ColumnObject) :
     return _set_missing_values(col, col1), codes_buffer
 
 
-def convert_to_cudf_string(col : ColumnObject) :
+def _protocol_column_to_cudf_column_string(col : ColumnObject) :
     """
     Convert a string ColumnObject to cudf Column object.
     """

From 3cc229b322b99402b59c6d10cd3166a35fe0bc3f Mon Sep 17 00:00:00 2001
From: iskode <ismael.from.kone@gmail.com>
Date: Fri, 22 Oct 2021 08:51:53 +0000
Subject: [PATCH 25/60] remove unused methods

---
 python/cudf/cudf/core/df_protocol.py | 41 ++--------------------------
 1 file changed, 3 insertions(+), 38 deletions(-)

diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py
index 89e67d60e76..49808fee17f 100644
--- a/python/cudf/cudf/core/df_protocol.py
+++ b/python/cudf/cudf/core/df_protocol.py
@@ -94,6 +94,9 @@ class _DtypeKind(enum.IntEnum):
     DATETIME = 22
     CATEGORICAL = 23
 
+
+
+
 def _protocol_column_to_cudf_column_numeric(col:ColumnObject):
     """
     Convert an int, uint, float or bool protocol column to the corresponding cudf column
@@ -113,17 +116,6 @@ def _check_data_is_on_gpu(buffer):
         raise TypeError("This operation must copy data from CPU to GPU."
                             "Set `allow_copy=True` to allow it.")
 
-def buffer_to_cupy_ndarray(_buffer, _dtype) -> cp.ndarray:
-    if _buffer.__dlpack_device__()[0] == 2: # dataframe is on GPU/CUDA
-        x = _gpu_buffer_to_cupy(_buffer, _dtype)
-    else:
-        if not _buffer._allow_copy:
-            raise TypeError("This operation must copy data from CPU to GPU."
-                            "Set `allow_copy=True` to allow it.")
-        x = _cpu_buffer_to_cupy(_buffer, _dtype)
-
-    return x
-
 def _set_missing_values(protocol_col, cudf_col):
     null_kind, null_value = protocol_col.describe_null
     if  null_kind != 0:
@@ -134,16 +126,6 @@ def _set_missing_values(protocol_col, cudf_col):
 
     return cudf_col
 
-def _gpu_buffer_to_cupy(_buffer, _dtype):
-    _k = _DtypeKind
-    if _dtype[0] in (_k.INT, _k.UINT, _k.FLOAT, _k.CATEGORICAL):
-        x = cp.fromDlpack(_buffer.__dlpack__())
-    elif _dtype[0] == _k.BOOL: 
-        x = cp.fromDlpack(_buffer.__dlpack__()).astype(cp.bool_)
-    else:
-        raise NotImplementedError(f"Data type {_dtype[0]} not handled yet")
-    return x
-
 def protocol_dtypes_to_cupy_dtype(_dtype):
     kind = _dtype[0]
     bitwidth = _dtype[1]
@@ -158,23 +140,6 @@ def protocol_dtypes_to_cupy_dtype(_dtype):
     _cp_dtypes = {0: _ints, 1: _uints, 2: _floats, 20: {8: bool}}
     return _cp_dtypes[kind][bitwidth]
 
-def _cpu_buffer_to_cupy(_buffer, _dtype):
-    # Handle the dtype
-   
-    column_dtype = protocol_dtypes_to_cupy_dtype(_dtype)
-    # No DLPack yet, so need to construct a new ndarray from the data pointer
-    # and size in the buffer plus the dtype on the column
-    ctypes_type = np.ctypeslib.as_ctypes_type(column_dtype)
-    data_pointer = ctypes.cast(_buffer.ptr, ctypes.POINTER(ctypes_type))
-
-    # NOTE: `x` does not own its memory, so the caller of this function must
-    #       either make a copy or hold on to a reference of the column or
-    #       buffer! (not done yet, this is pretty awful ...)
-    x = np.ctypeslib.as_array(data_pointer,
-                              shape=(_buffer.bufsize // (bitwidth//8),))
-    return cp.asarray(x, dtype=column_dtype)
-
-
 def _protocol_column_to_cudf_column_categorical(col : ColumnObject) :
     """
     Convert a categorical column to a Series instance

From 6fa456680dbb5c531eacc73a962f407fb35dd780 Mon Sep 17 00:00:00 2001
From: iskode <ismael.from.kone@gmail.com>
Date: Fri, 22 Oct 2021 09:23:57 +0000
Subject: [PATCH 26/60] reorganize code so that class mentions occur after
 their definitions

---
 python/cudf/cudf/core/df_protocol.py | 379 +++++++++++++--------------
 1 file changed, 189 insertions(+), 190 deletions(-)

diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py
index 49808fee17f..9a4f052b598 100644
--- a/python/cudf/cudf/core/df_protocol.py
+++ b/python/cudf/cudf/core/df_protocol.py
@@ -1,23 +1,3 @@
-"""
-Implementation of the dataframe exchange protocol.
-
-Public API
-----------
-
-from_dataframe : construct a cudf.DataFrame from an input data frame which
-                 implements the exchange protocol
-
-Notes
------
-
-- Interpreting a raw pointer (as in ``Buffer.ptr``) is annoying and unsafe to
-  do in pure Python. It's more general but definitely less friendly than having
-  ``to_arrow`` and ``to_numpy`` methods. So for the buffers which lack
-  ``__dlpack__`` (e.g., because the column dtype isn't supported by DLPack),
-  this is worth looking at again.
-
-"""
-
 import enum
 import collections
 import ctypes
@@ -31,59 +11,9 @@
 from numba import cuda
 
 
-# A typing protocol could be added later to let Mypy validate code using
-# `from_dataframe` better.
-DataFrameObject = Any
-ColumnObject = Any
-
-
-def from_dataframe(df : DataFrameObject, allow_copy: bool = False) :
-    """
-    Construct a cudf DataFrame from ``df`` if it supports ``__dataframe__``
-    """
-    if isinstance(df, cudf.DataFrame):
-        return df
-
-    if not hasattr(df, '__dataframe__'):
-        raise ValueError("`df` does not support __dataframe__")
-
-    return _from_dataframe(df.__dataframe__(allow_copy=allow_copy))
-
-
-def _from_dataframe(df : DataFrameObject) :
-    """
-    Create a cudf DataFrame object from DataFrameObject.
-    """
-    # Check number of chunks, if there's more than one we need to iterate
-    if df.num_chunks() > 1:
-        raise NotImplementedError
-
-    # We need a dict of columns here, with each column being a cudf column column.
-    columns = dict()
-    _k = _DtypeKind
-    _buffers = []  # hold on to buffers, keeps memory alive
-    for name in df.column_names():
-        col = df.get_column_by_name(name)
-
-        if col.dtype[0] in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL):
-            columns[name], _buf = _protocol_column_to_cudf_column_numeric(col)
-
-        elif col.dtype[0] == _k.CATEGORICAL:
-            columns[name], _buf = _protocol_column_to_cudf_column_categorical(col)
-
-        elif col.dtype[0] == _k.STRING:
-            columns[name], _buf = _protocol_column_to_cudf_column_string(col)
-            
-        else:
-            raise NotImplementedError(f"Data type {col.dtype[0]} not handled yet")
-        
-        _buffers.append(_buf)
-
-    df_new = cudf.DataFrame._from_data(columns)
-    df_new._buffers = _buffers
-    return df_new
-
 
+# Implementation of interchange protocol classes
+# ----------------------------------------------
 
 class _DtypeKind(enum.IntEnum):
     INT = 0
@@ -93,125 +23,7 @@ class _DtypeKind(enum.IntEnum):
     STRING = 21   # UTF-8
     DATETIME = 22
     CATEGORICAL = 23
-
-
-
-
-def _protocol_column_to_cudf_column_numeric(col:ColumnObject):
-    """
-    Convert an int, uint, float or bool protocol column to the corresponding cudf column
-    """
-    if col.offset != 0:
-        raise NotImplementedError("column.offset > 0 not handled yet")
-
-    _dbuffer, _ddtype = col.get_buffers()['data']
-    _check_data_is_on_gpu(_dbuffer)
-    dcol = build_column(Buffer(_dbuffer.ptr, _dbuffer.bufsize), 
-                        protocol_dtypes_to_cupy_dtype(_ddtype))        
-    return _set_missing_values(col, dcol), _dbuffer
-
-
-def _check_data_is_on_gpu(buffer):
-    if buffer.__dlpack_device__()[0] != 2 and not buffer._allow_copy:
-        raise TypeError("This operation must copy data from CPU to GPU."
-                            "Set `allow_copy=True` to allow it.")
-
-def _set_missing_values(protocol_col, cudf_col):
-    null_kind, null_value = protocol_col.describe_null
-    if  null_kind != 0:
-        assert null_kind == 3, f"cudf supports only bit mask, null_kind should be 3 ." 
-        _mask_buffer, _mask_dtype = protocol_col.get_buffers()["validity"]
-        bitmask = cp.asarray(Buffer(_mask_buffer.ptr, _mask_buffer.bufsize), cp.bool8) 
-        cudf_col[~bitmask] = None
-
-    return cudf_col
-
-def protocol_dtypes_to_cupy_dtype(_dtype):
-    kind = _dtype[0]
-    bitwidth = _dtype[1]
-    _k = _DtypeKind
-    if _dtype[0] not in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL,_k.CATEGORICAL,
-                         _k.STRING, _k.DATETIME):
-        raise RuntimeError(f"Data type {_dtype[0]} not handled yet")
-
-    _ints = {8: cp.int8, 16: cp.int16, 32: cp.int32, 64: cp.int64}
-    _uints = {8: cp.uint8, 16: cp.uint16, 32: cp.uint32, 64: cp.uint64}
-    _floats = {32: cp.float32, 64: cp.float64}
-    _cp_dtypes = {0: _ints, 1: _uints, 2: _floats, 20: {8: bool}}
-    return _cp_dtypes[kind][bitwidth]
-
-def _protocol_column_to_cudf_column_categorical(col : ColumnObject) :
-    """
-    Convert a categorical column to a Series instance
-    """
-    ordered, is_dict, mapping = col.describe_categorical
-    if not is_dict:
-        raise NotImplementedError('Non-dictionary categoricals not supported yet')
-
-    categories = as_column(mapping.values())
-    codes_buffer, codes_dtype = col.get_buffers()['data']
-    _check_data_is_on_gpu(codes_buffer)
-    cdtype = protocol_dtypes_to_cupy_dtype(codes_dtype)
-    codes = build_column(Buffer(codes_buffer.ptr, codes_buffer.bufsize), cdtype)
     
-    col1 = build_categorical_column(categories=categories,codes=codes,mask=codes.base_mask,
-                                    size=codes.size,ordered=ordered)
-
-    return _set_missing_values(col, col1), codes_buffer
-
-
-def _protocol_column_to_cudf_column_string(col : ColumnObject) :
-    """
-    Convert a string ColumnObject to cudf Column object.
-    """
-    # Retrieve the data buffers
-    buffers = col.get_buffers()
-
-    # Retrieve the data buffer containing the UTF-8 code units
-    dbuffer, bdtype = buffers["data"]
-    _check_data_is_on_gpu(dbuffer)
-    encoded_string = build_column(Buffer(dbuffer.ptr, dbuffer.bufsize),
-                        protocol_dtypes_to_cupy_dtype(bdtype)
-                        )
-
-    # Retrieve the offsets buffer containing the index offsets demarcating the beginning and end of each string
-    obuffer, odtype = buffers["offsets"]
-    offsets = build_column(Buffer(obuffer.ptr, obuffer.bufsize), 
-                           protocol_dtypes_to_cupy_dtype(odtype)
-                           )
-    
-    col_str = build_column(None, dtype=cp.dtype('O'), children=(offsets, encoded_string))
-
-    return _set_missing_values(col, col_str), buffers
-
-
-
-def __dataframe__(self, nan_as_null : bool = False,
-                  allow_copy : bool = True) -> dict:
-    """
-    The public method to attach to cudf.DataFrame.
-
-    We'll attach it via monkey-patching here for demo purposes. If Pandas adopts
-    the protocol, this will be a regular method on pandas.DataFrame.
-
-    ``nan_as_null`` is a keyword intended for the consumer to tell the
-    producer to overwrite null values in the data with ``NaN`` (or ``NaT``).
-    This currently has no effect; once support for nullable extension
-    dtypes is added, this value should be propagated to columns.
-
-    ``allow_copy`` is a keyword that defines whether or not the library is
-    allowed to make a copy of the data. For example, copying data would be
-    necessary if a library supports strided buffers, given that this protocol
-    specifies contiguous buffers.
-    Currently, if the flag is set to ``False`` and a copy is needed, a
-    ``RuntimeError`` will be raised.
-    """
-    return _CuDFDataFrame(
-        self, nan_as_null=nan_as_null, allow_copy=allow_copy)
-
-
-# Implementation of interchange protocol
-# --------------------------------------
 
 class _CuDFBuffer:
     """
@@ -672,3 +484,190 @@ def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable['_CuDFDataFram
         Return an iterator yielding the chunks.
         """
         return (self,)
+
+
+"""
+Implementation of the dataframe exchange protocol.
+
+Public API
+----------
+
+from_dataframe : construct a cudf.DataFrame from an input data frame which
+                 implements the exchange protocol
+
+Notes
+-----
+
+- Interpreting a raw pointer (as in ``Buffer.ptr``) is annoying and unsafe to
+  do in pure Python. It's more general but definitely less friendly than having
+  ``to_arrow`` and ``to_numpy`` methods. So for the buffers which lack
+  ``__dlpack__`` (e.g., because the column dtype isn't supported by DLPack),
+  this is worth looking at again.
+
+"""
+
+
+# A typing protocol could be added later to let Mypy validate code using
+# `from_dataframe` better.
+DataFrameObject = Any
+ColumnObject = Any
+
+
+def from_dataframe(df : DataFrameObject, allow_copy: bool = False) :
+    """
+    Construct a cudf DataFrame from ``df`` if it supports ``__dataframe__``
+    """
+    if isinstance(df, cudf.DataFrame):
+        return df
+
+    if not hasattr(df, '__dataframe__'):
+        raise ValueError("`df` does not support __dataframe__")
+
+    return _from_dataframe(df.__dataframe__(allow_copy=allow_copy))
+
+
+def _from_dataframe(df : DataFrameObject) :
+    """
+    Create a cudf DataFrame object from DataFrameObject.
+    """
+    # Check number of chunks, if there's more than one we need to iterate
+    if df.num_chunks() > 1:
+        raise NotImplementedError
+
+    # We need a dict of columns here, with each column being a cudf column column.
+    columns = dict()
+    _k = _DtypeKind
+    _buffers = []  # hold on to buffers, keeps memory alive
+    for name in df.column_names():
+        col = df.get_column_by_name(name)
+
+        if col.dtype[0] in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL):
+            columns[name], _buf = _protocol_column_to_cudf_column_numeric(col)
+
+        elif col.dtype[0] == _k.CATEGORICAL:
+            columns[name], _buf = _protocol_column_to_cudf_column_categorical(col)
+
+        elif col.dtype[0] == _k.STRING:
+            columns[name], _buf = _protocol_column_to_cudf_column_string(col)
+            
+        else:
+            raise NotImplementedError(f"Data type {col.dtype[0]} not handled yet")
+        
+        _buffers.append(_buf)
+
+    df_new = cudf.DataFrame._from_data(columns)
+    df_new._buffers = _buffers
+    return df_new
+
+
+def _protocol_column_to_cudf_column_numeric(col:ColumnObject):
+    """
+    Convert an int, uint, float or bool protocol column to the corresponding cudf column
+    """
+    if col.offset != 0:
+        raise NotImplementedError("column.offset > 0 not handled yet")
+
+    _dbuffer, _ddtype = col.get_buffers()['data']
+    _check_data_is_on_gpu(_dbuffer)
+    dcol = build_column(Buffer(_dbuffer.ptr, _dbuffer.bufsize), 
+                        protocol_dtypes_to_cupy_dtype(_ddtype))        
+    return _set_missing_values(col, dcol), _dbuffer
+
+
+def _check_data_is_on_gpu(buffer):
+    if buffer.__dlpack_device__()[0] != 2 and not buffer._allow_copy:
+        raise TypeError("This operation must copy data from CPU to GPU."
+                            "Set `allow_copy=True` to allow it.")
+
+def _set_missing_values(protocol_col, cudf_col):
+    null_kind, null_value = protocol_col.describe_null
+    if  null_kind != 0:
+        assert null_kind == 3, f"cudf supports only bit mask, null_kind should be 3 ." 
+        _mask_buffer, _mask_dtype = protocol_col.get_buffers()["validity"]
+        bitmask = cp.asarray(Buffer(_mask_buffer.ptr, _mask_buffer.bufsize), cp.bool8) 
+        cudf_col[~bitmask] = None
+
+    return cudf_col
+
+def protocol_dtypes_to_cupy_dtype(_dtype):
+    kind = _dtype[0]
+    bitwidth = _dtype[1]
+    _k = _DtypeKind
+    if _dtype[0] not in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL,_k.CATEGORICAL,
+                         _k.STRING, _k.DATETIME):
+        raise RuntimeError(f"Data type {_dtype[0]} not handled yet")
+
+    _ints = {8: cp.int8, 16: cp.int16, 32: cp.int32, 64: cp.int64}
+    _uints = {8: cp.uint8, 16: cp.uint16, 32: cp.uint32, 64: cp.uint64}
+    _floats = {32: cp.float32, 64: cp.float64}
+    _cp_dtypes = {0: _ints, 1: _uints, 2: _floats, 20: {8: bool}}
+    return _cp_dtypes[kind][bitwidth]
+
+def _protocol_column_to_cudf_column_categorical(col : ColumnObject) :
+    """
+    Convert a categorical column to a Series instance
+    """
+    ordered, is_dict, mapping = col.describe_categorical
+    if not is_dict:
+        raise NotImplementedError('Non-dictionary categoricals not supported yet')
+
+    categories = as_column(mapping.values())
+    codes_buffer, codes_dtype = col.get_buffers()['data']
+    _check_data_is_on_gpu(codes_buffer)
+    cdtype = protocol_dtypes_to_cupy_dtype(codes_dtype)
+    codes = build_column(Buffer(codes_buffer.ptr, codes_buffer.bufsize), cdtype)
+    
+    col1 = build_categorical_column(categories=categories,codes=codes,mask=codes.base_mask,
+                                    size=codes.size,ordered=ordered)
+
+    return _set_missing_values(col, col1), codes_buffer
+
+
+def _protocol_column_to_cudf_column_string(col : ColumnObject) :
+    """
+    Convert a string ColumnObject to cudf Column object.
+    """
+    # Retrieve the data buffers
+    buffers = col.get_buffers()
+
+    # Retrieve the data buffer containing the UTF-8 code units
+    dbuffer, bdtype = buffers["data"]
+    _check_data_is_on_gpu(dbuffer)
+    encoded_string = build_column(Buffer(dbuffer.ptr, dbuffer.bufsize),
+                        protocol_dtypes_to_cupy_dtype(bdtype)
+                        )
+
+    # Retrieve the offsets buffer containing the index offsets demarcating the beginning and end of each string
+    obuffer, odtype = buffers["offsets"]
+    offsets = build_column(Buffer(obuffer.ptr, obuffer.bufsize), 
+                           protocol_dtypes_to_cupy_dtype(odtype)
+                           )
+    
+    col_str = build_column(None, dtype=cp.dtype('O'), children=(offsets, encoded_string))
+
+    return _set_missing_values(col, col_str), buffers
+
+
+
+def __dataframe__(self, nan_as_null : bool = False,
+                  allow_copy : bool = True) -> dict:
+    """
+    The public method to attach to cudf.DataFrame.
+
+    We'll attach it via monkey-patching here for demo purposes. If Pandas adopts
+    the protocol, this will be a regular method on pandas.DataFrame.
+
+    ``nan_as_null`` is a keyword intended for the consumer to tell the
+    producer to overwrite null values in the data with ``NaN`` (or ``NaT``).
+    This currently has no effect; once support for nullable extension
+    dtypes is added, this value should be propagated to columns.
+
+    ``allow_copy`` is a keyword that defines whether or not the library is
+    allowed to make a copy of the data. For example, copying data would be
+    necessary if a library supports strided buffers, given that this protocol
+    specifies contiguous buffers.
+    Currently, if the flag is set to ``False`` and a copy is needed, a
+    ``RuntimeError`` will be raised.
+    """
+    return _CuDFDataFrame(
+        self, nan_as_null=nan_as_null, allow_copy=allow_copy)
\ No newline at end of file

From c0f2bc347f7bcbe86395656dac76fbe3a40ac96a Mon Sep 17 00:00:00 2001
From: iskode <ismael.from.kone@gmail.com>
Date: Fri, 22 Oct 2021 09:29:20 +0000
Subject: [PATCH 27/60] replace '_ints' and similar with module-level constants

---
 python/cudf/cudf/core/df_protocol.py | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py
index 9a4f052b598..feceae5acaf 100644
--- a/python/cudf/cudf/core/df_protocol.py
+++ b/python/cudf/cudf/core/df_protocol.py
@@ -513,6 +513,12 @@ def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable['_CuDFDataFram
 ColumnObject = Any
 
 
+_INTS = {8: cp.int8, 16: cp.int16, 32: cp.int32, 64: cp.int64}
+_UINTS = {8: cp.uint8, 16: cp.uint16, 32: cp.uint32, 64: cp.uint64}
+_FLOATS = {32: cp.float32, 64: cp.float64}
+_CP_DTYPES = {0: _INTS, 1: _UINTS, 2: _FLOATS, 20: {8: bool}}
+
+
 def from_dataframe(df : DataFrameObject, allow_copy: bool = False) :
     """
     Construct a cudf DataFrame from ``df`` if it supports ``__dataframe__``
@@ -596,12 +602,8 @@ def protocol_dtypes_to_cupy_dtype(_dtype):
     if _dtype[0] not in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL,_k.CATEGORICAL,
                          _k.STRING, _k.DATETIME):
         raise RuntimeError(f"Data type {_dtype[0]} not handled yet")
-
-    _ints = {8: cp.int8, 16: cp.int16, 32: cp.int32, 64: cp.int64}
-    _uints = {8: cp.uint8, 16: cp.uint16, 32: cp.uint32, 64: cp.uint64}
-    _floats = {32: cp.float32, 64: cp.float64}
-    _cp_dtypes = {0: _ints, 1: _uints, 2: _floats, 20: {8: bool}}
-    return _cp_dtypes[kind][bitwidth]
+   
+    return _CP_DTYPES[kind][bitwidth]
 
 def _protocol_column_to_cudf_column_categorical(col : ColumnObject) :
     """

From 3a3a9dc0ec394ee0dcfe4cf9f7ee47d6aab9e5a6 Mon Sep 17 00:00:00 2001
From: iskode <ismael.from.kone@gmail.com>
Date: Fri, 22 Oct 2021 15:07:54 +0000
Subject: [PATCH 28/60] define module level Device class and remove device
 check with number (2)

---
 python/cudf/cudf/core/df_protocol.py | 28 +++++++++++++++++++---------
 1 file changed, 19 insertions(+), 9 deletions(-)

diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py
index feceae5acaf..1564d380108 100644
--- a/python/cudf/cudf/core/df_protocol.py
+++ b/python/cudf/cudf/core/df_protocol.py
@@ -12,6 +12,8 @@
 
 
 
+
+
 # Implementation of interchange protocol classes
 # ----------------------------------------------
 
@@ -23,7 +25,17 @@ class _DtypeKind(enum.IntEnum):
     STRING = 21   # UTF-8
     DATETIME = 22
     CATEGORICAL = 23
-    
+
+class Device(enum.IntEnum):
+    CPU = 1
+    CUDA = 2
+    CPU_PINNED = 3
+    OPENCL = 4
+    VULKAN = 7
+    METAL = 8
+    VPI = 9
+    ROCM = 10
+
 
 class _CuDFBuffer:
     """
@@ -61,6 +73,7 @@ def __dlpack__(self):
         try: 
             cudarray = cuda.as_cuda_array(self._buf).view(self._cudf_dtype)
             res = cp.asarray(cudarray).toDlpack()
+
         except ValueError:
             raise TypeError(f'dtype {self._cudf_dtype} unsupported by `dlpack`')
 
@@ -70,9 +83,6 @@ def __dlpack_device__(self) -> Tuple[enum.IntEnum, int]:
         """
         Device type and device ID for where the data in the buffer resides.
         """
-        class Device(enum.IntEnum):
-             CUDA = 2
-
         return (Device.CUDA, cp.asarray(self._buf).device.id)
 
     def __repr__(self) -> str:
@@ -485,7 +495,6 @@ def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable['_CuDFDataFram
         """
         return (self,)
 
-
 """
 Implementation of the dataframe exchange protocol.
 
@@ -519,7 +528,7 @@ def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable['_CuDFDataFram
 _CP_DTYPES = {0: _INTS, 1: _UINTS, 2: _FLOATS, 20: {8: bool}}
 
 
-def from_dataframe(df : DataFrameObject, allow_copy: bool = False) :
+def from_dataframe(df : DataFrameObject, allow_copy: bool = False) -> _CuDFDataFrame :
     """
     Construct a cudf DataFrame from ``df`` if it supports ``__dataframe__``
     """
@@ -538,7 +547,7 @@ def _from_dataframe(df : DataFrameObject) :
     """
     # Check number of chunks, if there's more than one we need to iterate
     if df.num_chunks() > 1:
-        raise NotImplementedError
+        raise NotImplementedError("More than one chunk not handled yet")
 
     # We need a dict of columns here, with each column being a cudf column column.
     columns = dict()
@@ -581,7 +590,7 @@ def _protocol_column_to_cudf_column_numeric(col:ColumnObject):
 
 
 def _check_data_is_on_gpu(buffer):
-    if buffer.__dlpack_device__()[0] != 2 and not buffer._allow_copy:
+    if buffer.__dlpack_device__()[0] != Device.CUDA and not buffer._allow_copy:
         raise TypeError("This operation must copy data from CPU to GPU."
                             "Set `allow_copy=True` to allow it.")
 
@@ -672,4 +681,5 @@ def __dataframe__(self, nan_as_null : bool = False,
     ``RuntimeError`` will be raised.
     """
     return _CuDFDataFrame(
-        self, nan_as_null=nan_as_null, allow_copy=allow_copy)
\ No newline at end of file
+        self, nan_as_null=nan_as_null, allow_copy=allow_copy)
+

From 7ac4f27d1b30d51fd2856dded05aec24bb6ae0fe Mon Sep 17 00:00:00 2001
From: iskode <ismael.from.kone@gmail.com>
Date: Fri, 22 Oct 2021 15:30:47 +0000
Subject: [PATCH 29/60] rename methods + annotate return types

---
 python/cudf/cudf/core/df_protocol.py       | 33 ++++++++++++----------
 python/cudf/cudf/tests/test_df_protocol.py |  6 ++--
 2 files changed, 20 insertions(+), 19 deletions(-)

diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py
index 1564d380108..7524d5af317 100644
--- a/python/cudf/cudf/core/df_protocol.py
+++ b/python/cudf/cudf/core/df_protocol.py
@@ -541,7 +541,7 @@ def from_dataframe(df : DataFrameObject, allow_copy: bool = False) -> _CuDFDataF
     return _from_dataframe(df.__dataframe__(allow_copy=allow_copy))
 
 
-def _from_dataframe(df : DataFrameObject) :
+def _from_dataframe(df : DataFrameObject) -> _CuDFDataFrame :
     """
     Create a cudf DataFrame object from DataFrameObject.
     """
@@ -575,7 +575,8 @@ def _from_dataframe(df : DataFrameObject) :
     return df_new
 
 
-def _protocol_column_to_cudf_column_numeric(col:ColumnObject):
+def _protocol_column_to_cudf_column_numeric(col:ColumnObject) -> \
+                    Tuple[cudf.core.column.NumericalColumn, _CuDFBuffer]:
     """
     Convert an int, uint, float or bool protocol column to the corresponding cudf column
     """
@@ -585,26 +586,26 @@ def _protocol_column_to_cudf_column_numeric(col:ColumnObject):
     _dbuffer, _ddtype = col.get_buffers()['data']
     _check_data_is_on_gpu(_dbuffer)
     dcol = build_column(Buffer(_dbuffer.ptr, _dbuffer.bufsize), 
-                        protocol_dtypes_to_cupy_dtype(_ddtype))        
+                        protocol_dtype_to_cupy_dtype(_ddtype))        
     return _set_missing_values(col, dcol), _dbuffer
 
 
-def _check_data_is_on_gpu(buffer):
+def _check_data_is_on_gpu(buffer) -> None:
     if buffer.__dlpack_device__()[0] != Device.CUDA and not buffer._allow_copy:
         raise TypeError("This operation must copy data from CPU to GPU."
                             "Set `allow_copy=True` to allow it.")
 
-def _set_missing_values(protocol_col, cudf_col):
+def _set_missing_values(protocol_col, cudf_col) -> cudf.core.column.ColumnBase:
     null_kind, null_value = protocol_col.describe_null
     if  null_kind != 0:
-        assert null_kind == 3, f"cudf supports only bit mask, null_kind should be 3 ." 
+        assert null_kind == 3, f"cudf supports only bit mask, null_kind should be 3, got: {null_kind}." 
         _mask_buffer, _mask_dtype = protocol_col.get_buffers()["validity"]
         bitmask = cp.asarray(Buffer(_mask_buffer.ptr, _mask_buffer.bufsize), cp.bool8) 
         cudf_col[~bitmask] = None
 
     return cudf_col
 
-def protocol_dtypes_to_cupy_dtype(_dtype):
+def protocol_dtype_to_cupy_dtype(_dtype) -> cp.dtype:
     kind = _dtype[0]
     bitwidth = _dtype[1]
     _k = _DtypeKind
@@ -614,7 +615,8 @@ def protocol_dtypes_to_cupy_dtype(_dtype):
    
     return _CP_DTYPES[kind][bitwidth]
 
-def _protocol_column_to_cudf_column_categorical(col : ColumnObject) :
+def _protocol_column_to_cudf_column_categorical(col : ColumnObject) -> \
+    Tuple[cudf.core.column.CategoricalColumn, _CuDFBuffer] :
     """
     Convert a categorical column to a Series instance
     """
@@ -625,16 +627,17 @@ def _protocol_column_to_cudf_column_categorical(col : ColumnObject) :
     categories = as_column(mapping.values())
     codes_buffer, codes_dtype = col.get_buffers()['data']
     _check_data_is_on_gpu(codes_buffer)
-    cdtype = protocol_dtypes_to_cupy_dtype(codes_dtype)
+    cdtype = protocol_dtype_to_cupy_dtype(codes_dtype)
     codes = build_column(Buffer(codes_buffer.ptr, codes_buffer.bufsize), cdtype)
     
-    col1 = build_categorical_column(categories=categories,codes=codes,mask=codes.base_mask,
+    cudfcol = build_categorical_column(categories=categories,codes=codes,mask=codes.base_mask,
                                     size=codes.size,ordered=ordered)
 
-    return _set_missing_values(col, col1), codes_buffer
+    return _set_missing_values(col, cudfcol), codes_buffer
 
 
-def _protocol_column_to_cudf_column_string(col : ColumnObject) :
+def _protocol_column_to_cudf_column_string(col : ColumnObject) -> \
+    Tuple[cudf.core.column.StringColumn, Tuple[_CuDFBuffer]] :
     """
     Convert a string ColumnObject to cudf Column object.
     """
@@ -645,13 +648,13 @@ def _protocol_column_to_cudf_column_string(col : ColumnObject) :
     dbuffer, bdtype = buffers["data"]
     _check_data_is_on_gpu(dbuffer)
     encoded_string = build_column(Buffer(dbuffer.ptr, dbuffer.bufsize),
-                        protocol_dtypes_to_cupy_dtype(bdtype)
+                        protocol_dtype_to_cupy_dtype(bdtype)
                         )
 
     # Retrieve the offsets buffer containing the index offsets demarcating the beginning and end of each string
     obuffer, odtype = buffers["offsets"]
     offsets = build_column(Buffer(obuffer.ptr, obuffer.bufsize), 
-                           protocol_dtypes_to_cupy_dtype(odtype)
+                           protocol_dtype_to_cupy_dtype(odtype)
                            )
     
     col_str = build_column(None, dtype=cp.dtype('O'), children=(offsets, encoded_string))
@@ -661,7 +664,7 @@ def _protocol_column_to_cudf_column_string(col : ColumnObject) :
 
 
 def __dataframe__(self, nan_as_null : bool = False,
-                  allow_copy : bool = True) -> dict:
+                  allow_copy : bool = True) -> _CuDFDataFrame:
     """
     The public method to attach to cudf.DataFrame.
 
diff --git a/python/cudf/cudf/tests/test_df_protocol.py b/python/cudf/cudf/tests/test_df_protocol.py
index 2fe1652750d..3b697d2d602 100644
--- a/python/cudf/cudf/tests/test_df_protocol.py
+++ b/python/cudf/cudf/tests/test_df_protocol.py
@@ -5,9 +5,7 @@
 from cudf.core.df_protocol import (
     _from_dataframe, 
     _DtypeKind,
-    protocol_dtypes_to_cupy_dtype,
-
-
+    protocol_dtype_to_cupy_dtype,
     _CuDFDataFrame,
     _CuDFColumn,
     _CuDFBuffer
@@ -36,7 +34,7 @@ def assert_buffer_equal(buffer_dtype: Tuple[_CuDFBuffer, Any], cudfcol):
     device_id = cp.asarray(cudfcol.data).device.id
     assert buf.__dlpack_device__() == (2, device_id)
     col_from_buf = build_column(Buffer(buf.ptr, buf.bufsize),
-                        protocol_dtypes_to_cupy_dtype(dtype)
+                        protocol_dtype_to_cupy_dtype(dtype)
                         )
     # check that non null values are the equals as null are represented
     # by sentinel values in the buffer.

From c572c646503c437faa31cbcf2ef4277b63b3ce98 Mon Sep 17 00:00:00 2001
From: iskode <ismael.from.kone@gmail.com>
Date: Sat, 23 Oct 2021 11:42:43 +0000
Subject: [PATCH 30/60] add type annotations

---
 python/cudf/cudf/core/df_protocol.py | 74 +++++++++++++++-------------
 1 file changed, 39 insertions(+), 35 deletions(-)

diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py
index 7524d5af317..fe883c86be4 100644
--- a/python/cudf/cudf/core/df_protocol.py
+++ b/python/cudf/cudf/core/df_protocol.py
@@ -42,9 +42,10 @@ class _CuDFBuffer:
     Data in the buffer is guaranteed to be contiguous in memory.
     """
 
-    def __init__(self, buf : Buffer, cudf_dtype, allow_copy : bool = True) -> None:
+    def __init__(self, buf : cudf.core.buffer.Buffer, 
+                 cudf_dtype: cp.dtype, allow_copy : bool = True) -> None:
         """
-        Use cudf Buffer object.
+        Use cudf.core.buffer.Buffer object.
         """
         # Store the cudf buffer where the data resides as a private
         # attribute, so we can use it to retrieve the public attributes
@@ -66,7 +67,7 @@ def ptr(self) -> int:
         """
         return self._buf.ptr
         
-    def __dlpack__(self):
+    def __dlpack__(self) :
         """
         DLPack not implemented in NumPy yet, so leave it out here.
         """
@@ -107,7 +108,7 @@ class _CuDFColumn:
 
     """
 
-    def __init__(self, column,
+    def __init__(self, column: cudf.core.column.ColumnBase,
                  nan_as_null : bool = True, 
                  allow_copy: bool = True) -> None:
         """
@@ -217,7 +218,7 @@ def _dtype_from_cudfdtype(self, dtype) -> Tuple[enum.IntEnum, int, str, str]:
         return (kind, bitwidth, format_str, endianness)
 
     @property
-    def describe_categorical(self) -> Tuple[Any, bool, Dict[int, Any]]:
+    def describe_categorical(self) -> Tuple[bool, bool, Dict[int, Any]]:
         """
         If the dtype is categorical, there are two options:
 
@@ -313,7 +314,7 @@ def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable['_CuDFColumn']
         """
         return (self,)
 
-    def get_buffers(self) -> Dict[str, Any]:
+    def get_buffers(self) -> Dict[str, _CuDFBuffer]:
         """
         Return a dictionary containing the underlying buffers.
 
@@ -432,7 +433,8 @@ class _CuDFDataFrame:
     ``cudf.DataFrame.__dataframe__`` as objects with the methods and
     attributes defined on this class.
     """
-    def __init__(self, df, nan_as_null : bool = True,
+    def __init__(self, df : 'cudf.core.dataframe.DataFrame',
+                 nan_as_null : bool = True,
                  allow_copy : bool = True) -> None:
         """
         Constructor - an instance of this (private) class is returned from
@@ -495,6 +497,31 @@ def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable['_CuDFDataFram
         """
         return (self,)
 
+
+def __dataframe__(self, nan_as_null : bool = False,
+                  allow_copy : bool = True) -> _CuDFDataFrame:
+    """
+    The public method to attach to cudf.DataFrame.
+
+    We'll attach it via monkey-patching here for demo purposes. If Pandas adopts
+    the protocol, this will be a regular method on pandas.DataFrame.
+
+    ``nan_as_null`` is a keyword intended for the consumer to tell the
+    producer to overwrite null values in the data with ``NaN`` (or ``NaT``).
+    This currently has no effect; once support for nullable extension
+    dtypes is added, this value should be propagated to columns.
+
+    ``allow_copy`` is a keyword that defines whether or not the library is
+    allowed to make a copy of the data. For example, copying data would be
+    necessary if a library supports strided buffers, given that this protocol
+    specifies contiguous buffers.
+    Currently, if the flag is set to ``False`` and a copy is needed, a
+    ``RuntimeError`` will be raised.
+    """
+    return _CuDFDataFrame(
+        self, nan_as_null=nan_as_null, allow_copy=allow_copy)
+
+
 """
 Implementation of the dataframe exchange protocol.
 
@@ -590,12 +617,15 @@ def _protocol_column_to_cudf_column_numeric(col:ColumnObject) -> \
     return _set_missing_values(col, dcol), _dbuffer
 
 
-def _check_data_is_on_gpu(buffer) -> None:
+def _check_data_is_on_gpu(buffer : _CuDFBuffer) -> None:
     if buffer.__dlpack_device__()[0] != Device.CUDA and not buffer._allow_copy:
         raise TypeError("This operation must copy data from CPU to GPU."
                             "Set `allow_copy=True` to allow it.")
 
-def _set_missing_values(protocol_col, cudf_col) -> cudf.core.column.ColumnBase:
+def _set_missing_values(protocol_col: _CuDFColumn, 
+                        cudf_col:'cudf.core.dataframe.DataFrame') \
+                        -> cudf.core.column.ColumnBase:
+
     null_kind, null_value = protocol_col.describe_null
     if  null_kind != 0:
         assert null_kind == 3, f"cudf supports only bit mask, null_kind should be 3, got: {null_kind}." 
@@ -660,29 +690,3 @@ def _protocol_column_to_cudf_column_string(col : ColumnObject) -> \
     col_str = build_column(None, dtype=cp.dtype('O'), children=(offsets, encoded_string))
 
     return _set_missing_values(col, col_str), buffers
-
-
-
-def __dataframe__(self, nan_as_null : bool = False,
-                  allow_copy : bool = True) -> _CuDFDataFrame:
-    """
-    The public method to attach to cudf.DataFrame.
-
-    We'll attach it via monkey-patching here for demo purposes. If Pandas adopts
-    the protocol, this will be a regular method on pandas.DataFrame.
-
-    ``nan_as_null`` is a keyword intended for the consumer to tell the
-    producer to overwrite null values in the data with ``NaN`` (or ``NaT``).
-    This currently has no effect; once support for nullable extension
-    dtypes is added, this value should be propagated to columns.
-
-    ``allow_copy`` is a keyword that defines whether or not the library is
-    allowed to make a copy of the data. For example, copying data would be
-    necessary if a library supports strided buffers, given that this protocol
-    specifies contiguous buffers.
-    Currently, if the flag is set to ``False`` and a copy is needed, a
-    ``RuntimeError`` will be raised.
-    """
-    return _CuDFDataFrame(
-        self, nan_as_null=nan_as_null, allow_copy=allow_copy)
-

From c60cf5b8e42bec051c616855dfeb6c96960b2708 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Isma=C3=ABl=20Kon=C3=A9?= <i.kone@edu.umi.ac.ma>
Date: Fri, 29 Oct 2021 20:45:16 +0000
Subject: [PATCH 31/60] correct '__dlpack_device__' annotation

Co-authored-by: Bradley Dice <bdice@bradleydice.com>
---
 python/cudf/cudf/core/df_protocol.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py
index fe883c86be4..b4c103eb260 100644
--- a/python/cudf/cudf/core/df_protocol.py
+++ b/python/cudf/cudf/core/df_protocol.py
@@ -80,7 +80,7 @@ def __dlpack__(self) :
 
         return res
 
-    def __dlpack_device__(self) -> Tuple[enum.IntEnum, int]:
+    def __dlpack_device__(self) -> Tuple[_Device, int]:
         """
         Device type and device ID for where the data in the buffer resides.
         """

From 4262fc74386aedd5e1201396d3956a02d3d99458 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Isma=C3=ABl=20Kon=C3=A9?= <i.kone@edu.umi.ac.ma>
Date: Fri, 29 Oct 2021 20:46:11 +0000
Subject: [PATCH 32/60] correct 'dtype' method annotation

Co-authored-by: Bradley Dice <bdice@bradleydice.com>
---
 python/cudf/cudf/core/df_protocol.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py
index b4c103eb260..46f49e0b6e7 100644
--- a/python/cudf/cudf/core/df_protocol.py
+++ b/python/cudf/cudf/core/df_protocol.py
@@ -139,7 +139,7 @@ def offset(self) -> int:
         return 0
 
     @property
-    def dtype(self) -> Tuple[enum.IntEnum, int, str, str]:
+    def dtype(self) -> Tuple[_DtypeKind, int, str, str]:
         """
         Dtype description as a tuple ``(kind, bit-width, format string, endianness)``
 

From a7fe2876f046b2accf44b81411974c8393c1ecef Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Isma=C3=ABl=20Kon=C3=A9?= <i.kone@edu.umi.ac.ma>
Date: Mon, 1 Nov 2021 13:31:18 +0100
Subject: [PATCH 33/60] mark 'Device' class as private

Co-authored-by: Bradley Dice <bdice@bradleydice.com>
---
 python/cudf/cudf/core/df_protocol.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py
index 46f49e0b6e7..1354c224176 100644
--- a/python/cudf/cudf/core/df_protocol.py
+++ b/python/cudf/cudf/core/df_protocol.py
@@ -26,7 +26,7 @@ class _DtypeKind(enum.IntEnum):
     DATETIME = 22
     CATEGORICAL = 23
 
-class Device(enum.IntEnum):
+class _Device(enum.IntEnum):
     CPU = 1
     CUDA = 2
     CPU_PINNED = 3

From f5aef739370b69ebf4b30bdca6ce0eb998d88598 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Isma=C3=ABl=20Kon=C3=A9?= <i.kone@edu.umi.ac.ma>
Date: Mon, 1 Nov 2021 13:49:58 +0100
Subject: [PATCH 34/60] Apply suggestions from @bdice code review

Co-authored-by: Bradley Dice <bdice@bradleydice.com>
---
 python/cudf/cudf/core/df_protocol.py       | 31 ++++++++++------------
 python/cudf/cudf/tests/test_df_protocol.py |  4 +--
 2 files changed, 16 insertions(+), 19 deletions(-)

diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py
index 1354c224176..e6584e47326 100644
--- a/python/cudf/cudf/core/df_protocol.py
+++ b/python/cudf/cudf/core/df_protocol.py
@@ -43,7 +43,7 @@ class _CuDFBuffer:
     """
 
     def __init__(self, buf : cudf.core.buffer.Buffer, 
-                 cudf_dtype: cp.dtype, allow_copy : bool = True) -> None:
+                 dtype: np.dtype, allow_copy : bool = True) -> None:
         """
         Use cudf.core.buffer.Buffer object.
         """
@@ -87,7 +87,7 @@ def __dlpack_device__(self) -> Tuple[_Device, int]:
         return (Device.CUDA, cp.asarray(self._buf).device.id)
 
     def __repr__(self) -> str:
-        return 'CuDFBuffer(' + str({'bufsize': self.bufsize,
+        return f'{self.__class__.__name__}(' + str({'bufsize': self.bufsize,
                                       'ptr': self.ptr,
                                       'dlpack': self.__dlpack__(),
                                       'device': self.__dlpack_device__()[0].name}
@@ -186,7 +186,7 @@ def dtype(self) -> Tuple[_DtypeKind, int, str, str]:
 
         return self._dtype_from_cudfdtype(dtype)
 
-    def _dtype_from_cudfdtype(self, dtype) -> Tuple[enum.IntEnum, int, str, str]:
+    def _dtype_from_cudfdtype(self, dtype) -> Tuple[_DtypeKind, int, str, str]:
         """
         See `self.dtype` for details.
         """
@@ -214,7 +214,7 @@ def _dtype_from_cudfdtype(self, dtype) -> Tuple[enum.IntEnum, int, str, str]:
 
         bitwidth = dtype.itemsize * 8
         format_str = dtype.str
-        endianness = dtype.byteorder if not kind == _k.CATEGORICAL else '='
+        endianness = dtype.byteorder if kind != _k.CATEGORICAL else '='
         return (kind, bitwidth, format_str, endianness)
 
     @property
@@ -225,7 +225,7 @@ def describe_categorical(self) -> Tuple[bool, bool, Dict[int, Any]]:
         - There are only values in the data buffer.
         - There is a separate dictionary-style encoding for categorical values.
 
-        Raises RuntimeError if the dtype is not categorical
+        Raises TypeError if the dtype is not categorical
 
         Content of returned dict:
 
@@ -269,18 +269,15 @@ def describe_null(self) -> Tuple[int, Any]:
         otherwise.
         """
         if self.null_count == 0:
-            # there is no validity mask in this case
-            # so making it non-nullable (hackingly)
-            null = 0
-            value = None
+            # there is no validity mask so it is non-nullable
+            return 0, None
         else :
             _k = _DtypeKind
             kind = self.dtype[0]
             # bit mask is universally used in cudf for missing
             if kind in (_k.INT, _k.UINT, _k.FLOAT, _k.CATEGORICAL,
                         _k.BOOL, _k.STRING, _k.DATETIME):
-                null = 3
-                value = 0
+                return 3, 0
             else:
                 raise NotImplementedError(f"Data type {self.dtype} not yet supported")
 
@@ -291,7 +288,7 @@ def null_count(self) -> int:
         """
         Number of null elements. Should always be known.
         """
-        return self._col.isna().sum()
+        return self._col.null_count
 
     @property
     def metadata(self) -> Dict[str, Any]:
@@ -308,7 +305,7 @@ def num_chunks(self) -> int:
 
     def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable['_CuDFColumn']:
         """
-        Return an iterator yielding the chunks.
+        Return an iterable yielding the chunks.
 
         See `DataFrame.get_chunks` for details on ``n_chunks``.
         """
@@ -479,7 +476,7 @@ def get_columns(self) -> Iterable[_CuDFColumn]:
                 for name in self._df.columns]
 
     def select_columns(self, indices: Sequence[int]) -> '_CuDFDataFrame':
-        if not isinstance(indices, collections.Sequence):
+        if not isinstance(indices, collections.abc.Sequence):
             raise ValueError("`indices` is not a sequence")
 
         return _CuDFDataFrame(self._df.iloc[:, indices])
@@ -576,7 +573,7 @@ def _from_dataframe(df : DataFrameObject) -> _CuDFDataFrame :
     if df.num_chunks() > 1:
         raise NotImplementedError("More than one chunk not handled yet")
 
-    # We need a dict of columns here, with each column being a cudf column column.
+    # We need a dict of columns here, with each column being a cudf column.
     columns = dict()
     _k = _DtypeKind
     _buffers = []  # hold on to buffers, keeps memory alive
@@ -619,7 +616,7 @@ def _protocol_column_to_cudf_column_numeric(col:ColumnObject) -> \
 
 def _check_data_is_on_gpu(buffer : _CuDFBuffer) -> None:
     if buffer.__dlpack_device__()[0] != Device.CUDA and not buffer._allow_copy:
-        raise TypeError("This operation must copy data from CPU to GPU."
+        raise TypeError("This operation must copy data from CPU to GPU. "
                             "Set `allow_copy=True` to allow it.")
 
 def _set_missing_values(protocol_col: _CuDFColumn, 
@@ -627,7 +624,7 @@ def _set_missing_values(protocol_col: _CuDFColumn,
                         -> cudf.core.column.ColumnBase:
 
     null_kind, null_value = protocol_col.describe_null
-    if  null_kind != 0:
+    if null_kind != 0:
         assert null_kind == 3, f"cudf supports only bit mask, null_kind should be 3, got: {null_kind}." 
         _mask_buffer, _mask_dtype = protocol_col.get_buffers()["validity"]
         bitmask = cp.asarray(Buffer(_mask_buffer.ptr, _mask_buffer.bufsize), cp.bool8) 
diff --git a/python/cudf/cudf/tests/test_df_protocol.py b/python/cudf/cudf/tests/test_df_protocol.py
index 3b697d2d602..923756e9d06 100644
--- a/python/cudf/cudf/tests/test_df_protocol.py
+++ b/python/cudf/cudf/tests/test_df_protocol.py
@@ -38,7 +38,7 @@ def assert_buffer_equal(buffer_dtype: Tuple[_CuDFBuffer, Any], cudfcol):
                         )
     # check that non null values are the equals as null are represented
     # by sentinel values in the buffer.
-    non_null_idxs = cudfcol!=None
+    non_null_idxs = cudfcol is not None
     assert_eq(col_from_buf[non_null_idxs], cudfcol[non_null_idxs])
     
     if dtype[0] != _DtypeKind.BOOL:
@@ -184,7 +184,7 @@ def test_NA_categorical_dtype():
     col = df.__dataframe__().get_column_by_name('B')
     assert col.dtype[0] == _DtypeKind.CATEGORICAL
     assert col.null_count == 2
-    assert col.describe_null == (3, 0)  # sentinel value -1
+    assert col.describe_null == (3, 0)
     assert col.num_chunks() == 1
     assert col.describe_categorical == (False, True, {0: 1, 1: 2, 2: 5})
     _test_from_dataframe_equals(df.__dataframe__(allow_copy=False))

From c47ce43390e2820a4f63cc8d8ed31db4e908e80e Mon Sep 17 00:00:00 2001
From: iskode <ismael.from.kone@gmail.com>
Date: Mon, 1 Nov 2021 13:36:35 +0000
Subject: [PATCH 35/60] fix test errors due to changes + remove commented code

---
 python/cudf/cudf/core/df_protocol.py       | 33 +++++++++++-----------
 python/cudf/cudf/tests/test_df_protocol.py | 18 ++++--------
 2 files changed, 23 insertions(+), 28 deletions(-)

diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py
index e6584e47326..b9c3efac707 100644
--- a/python/cudf/cudf/core/df_protocol.py
+++ b/python/cudf/cudf/core/df_protocol.py
@@ -36,6 +36,9 @@ class _Device(enum.IntEnum):
     VPI = 9
     ROCM = 10
 
+_k = _DtypeKind
+SUPPORTED_DTYPE = (_k.INT, _k.UINT, _k.FLOAT, _k.CATEGORICAL,
+                   _k.BOOL, _k.STRING)
 
 class _CuDFBuffer:
     """
@@ -50,7 +53,7 @@ def __init__(self, buf : cudf.core.buffer.Buffer,
         # Store the cudf buffer where the data resides as a private
         # attribute, so we can use it to retrieve the public attributes
         self._buf = buf
-        self._cudf_dtype =  cudf_dtype
+        self._dtype =  dtype
         self._allow_copy = allow_copy
 
     @property
@@ -72,19 +75,19 @@ def __dlpack__(self) :
         DLPack not implemented in NumPy yet, so leave it out here.
         """
         try: 
-            cudarray = cuda.as_cuda_array(self._buf).view(self._cudf_dtype)
+            cudarray = cuda.as_cuda_array(self._buf).view(self._dtype)
             res = cp.asarray(cudarray).toDlpack()
 
         except ValueError:
-            raise TypeError(f'dtype {self._cudf_dtype} unsupported by `dlpack`')
+            raise TypeError(f'dtype {self._dtype} unsupported by `dlpack`')
 
         return res
 
     def __dlpack_device__(self) -> Tuple[_Device, int]:
         """
-        Device type and device ID for where the data in the buffer resides.
+        _Device type and _Device ID for where the data in the buffer resides.
         """
-        return (Device.CUDA, cp.asarray(self._buf).device.id)
+        return (_Device.CUDA, cp.asarray(self._buf).device.id)
 
     def __repr__(self) -> str:
         return f'{self.__class__.__name__}(' + str({'bufsize': self.bufsize,
@@ -268,20 +271,18 @@ def describe_null(self) -> Tuple[int, Any]:
         mask or a byte mask, the value (0 or 1) indicating a missing value. None
         otherwise.
         """
+        _k = _DtypeKind
+        kind = self.dtype[0]
         if self.null_count == 0:
             # there is no validity mask so it is non-nullable
             return 0, None
-        else :
-            _k = _DtypeKind
-            kind = self.dtype[0]
-            # bit mask is universally used in cudf for missing
-            if kind in (_k.INT, _k.UINT, _k.FLOAT, _k.CATEGORICAL,
-                        _k.BOOL, _k.STRING, _k.DATETIME):
-                return 3, 0
-            else:
-                raise NotImplementedError(f"Data type {self.dtype} not yet supported")
 
-        return null, value
+        elif kind in SUPPORTED_DTYPE:
+            # bit mask is universally used in cudf for missing
+            return 3, 0
+            
+        else:
+            raise NotImplementedError(f"Data type {self.dtype} not yet supported")
 
     @property
     def null_count(self) -> int:
@@ -615,7 +616,7 @@ def _protocol_column_to_cudf_column_numeric(col:ColumnObject) -> \
 
 
 def _check_data_is_on_gpu(buffer : _CuDFBuffer) -> None:
-    if buffer.__dlpack_device__()[0] != Device.CUDA and not buffer._allow_copy:
+    if buffer.__dlpack_device__()[0] != _Device.CUDA and not buffer._allow_copy:
         raise TypeError("This operation must copy data from CPU to GPU. "
                             "Set `allow_copy=True` to allow it.")
 
diff --git a/python/cudf/cudf/tests/test_df_protocol.py b/python/cudf/cudf/tests/test_df_protocol.py
index 923756e9d06..dda608cf064 100644
--- a/python/cudf/cudf/tests/test_df_protocol.py
+++ b/python/cudf/cudf/tests/test_df_protocol.py
@@ -29,8 +29,8 @@
 
 DataFrameObject = Any
 
-def assert_buffer_equal(buffer_dtype: Tuple[_CuDFBuffer, Any], cudfcol):
-    buf, dtype = buffer_dtype
+def assert_buffer_equal(buffer_and_dtype: Tuple[_CuDFBuffer, Any], cudfcol):
+    buf, dtype = buffer_and_dtype
     device_id = cp.asarray(cudfcol.data).device.id
     assert buf.__dlpack_device__() == (2, device_id)
     col_from_buf = build_column(Buffer(buf.ptr, buf.bufsize),
@@ -38,13 +38,14 @@ def assert_buffer_equal(buffer_dtype: Tuple[_CuDFBuffer, Any], cudfcol):
                         )
     # check that non null values are the equals as null are represented
     # by sentinel values in the buffer.
-    non_null_idxs = cudfcol is not None
+    non_null_idxs = cudfcol != None
+    print(non_null_idxs, cudfcol is not None)
     assert_eq(col_from_buf[non_null_idxs], cudfcol[non_null_idxs])
     
     if dtype[0] != _DtypeKind.BOOL:
         array_from_dlpack = cp.fromDlpack(buf.__dlpack__())
         col_array = cp.asarray(cudfcol.data_array_view)
-        assert_eq(array_from_dlpack.all(), col_array.all())
+        assert_eq(array_from_dlpack.flatten(), col_array.flatten())
     else:
         pytest.raises(TypeError, buf.__dlpack__)
 
@@ -54,7 +55,7 @@ def assert_buffer_equal(buffer_dtype: Tuple[_CuDFBuffer, Any], cudfcol):
 def assert_column_equal(col: _CuDFColumn, cudfcol):
     assert col.size == cudfcol.size 
     assert col.offset == 0
-    assert col.null_count == cudfcol.isna().sum() 
+    assert col.null_count == cudfcol.null_count
     assert col.num_chunks() == 1
     if col.null_count == 0 :
         pytest.raises(RuntimeError, col._get_validity_buffer)
@@ -117,12 +118,6 @@ def _test_datatype(data):
     _test_from_dataframe_equals(cdf.__dataframe__(allow_copy=False))
     _test_from_dataframe_equals(cdf.__dataframe__(allow_copy=True))
 
-    # pdf = pd.DataFrame(data=data)
-    # cpu_dfobj = _CuDFDataFrame(pdf)
-    # _test_from_dataframe_exception(cpu_dfobj)
-    # _test_from_dataframe_equals(cpu_dfobj, allow_copy=True)
-    
-
 def test_from_dataframe():
     data = dict(a=[1, 2, 3], b=[9, 10, 11])
     df1 = cudf.DataFrame(data=data)
@@ -154,7 +149,6 @@ def test_bool_dtype():
 def test_string_dtype():
     data_string = dict(a=["a", "b", "cdef", "", "g"])
     _test_datatype(data_string)
-   
 
 
 def test_mixed_dtype():

From 139ca5adfc9004ba12a2502f4fb0f019022401dc Mon Sep 17 00:00:00 2001
From: iskode <ismael.from.kone@gmail.com>
Date: Mon, 1 Nov 2021 13:42:44 +0000
Subject: [PATCH 36/60] add string column to mixed type test + a mixed type
 test case with NA

---
 python/cudf/cudf/tests/test_df_protocol.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/python/cudf/cudf/tests/test_df_protocol.py b/python/cudf/cudf/tests/test_df_protocol.py
index dda608cf064..ac99943b671 100644
--- a/python/cudf/cudf/tests/test_df_protocol.py
+++ b/python/cudf/cudf/tests/test_df_protocol.py
@@ -153,7 +153,8 @@ def test_string_dtype():
 
 def test_mixed_dtype():
     data_mixed = dict(int=[1, 2, 3], float=[1.5, 2.5, 3.5],
-                        bool=[True, False, True], categorical=[5, 1, 5])
+                        bool=[True, False, True], categorical=[5, 1, 5],
+                        string=["rapidsai-cudf ", "", "df protocol"])
     _test_datatype(data_mixed)
 
 
@@ -201,3 +202,12 @@ def test_NA_string_dtype():
     assert col.num_chunks() == 1
     _test_from_dataframe_equals(df.__dataframe__(allow_copy=False))
     _test_from_dataframe_equals(df.__dataframe__(allow_copy=True))
+
+
+def test_NA_mixed_dtype():
+    data_mixed = dict(int=[1, None, 2, 3, 1000], float=[None, 1.5, 2.5, 3.5, None],
+                        bool=[True, None, False, None, None], 
+                        categorical=[5, 1, 5, 3, None],
+                        string=[None, None, None, "df protocol", None])
+    _test_datatype(data_mixed)
+

From bcec52c36654879088499aec42dfbad39186df68 Mon Sep 17 00:00:00 2001
From: iskode <ismael.from.kone@gmail.com>
Date: Mon, 1 Nov 2021 14:55:58 +0000
Subject: [PATCH 37/60] address remaining suggestions from @bdice

---
 python/cudf/cudf/core/df_protocol.py       | 45 ++++++++++------------
 python/cudf/cudf/tests/test_df_protocol.py |  1 -
 2 files changed, 20 insertions(+), 26 deletions(-)

diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py
index b9c3efac707..d0b44ea597f 100644
--- a/python/cudf/cudf/core/df_protocol.py
+++ b/python/cudf/cudf/core/df_protocol.py
@@ -37,7 +37,7 @@ class _Device(enum.IntEnum):
     ROCM = 10
 
 _k = _DtypeKind
-SUPPORTED_DTYPE = (_k.INT, _k.UINT, _k.FLOAT, _k.CATEGORICAL,
+_SUPPORTED_KINDS = (_k.INT, _k.UINT, _k.FLOAT, _k.CATEGORICAL,
                    _k.BOOL, _k.STRING)
 
 class _CuDFBuffer:
@@ -212,7 +212,7 @@ def _dtype_from_cudfdtype(self, dtype) -> Tuple[_DtypeKind, int, str, str]:
                 raise ValueError(f"Data type {dtype} not supported by exchange"
                                  "protocol")
 
-        if kind not in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL, _k.CATEGORICAL, _k.STRING):
+        if kind not in _SUPPORTED_KINDS:
             raise NotImplementedError(f"Data type {dtype} not handled yet")
 
         bitwidth = dtype.itemsize * 8
@@ -277,7 +277,7 @@ def describe_null(self) -> Tuple[int, Any]:
             # there is no validity mask so it is non-nullable
             return 0, None
 
-        elif kind in SUPPORTED_DTYPE:
+        elif kind in _SUPPORTED_KINDS:
             # bit mask is universally used in cudf for missing
             return 3, 0
             
@@ -368,7 +368,6 @@ def _get_data_buffer(self) -> Tuple[_CuDFBuffer, Any]:  # Any is for self.dtype
             encoded_string = self._col.children[1]
             buffer = _CuDFBuffer(encoded_string.data, encoded_string.dtype, allow_copy=self._allow_copy)
             dtype = self._dtype_from_cudfdtype(encoded_string.dtype) 
-            # dtype = (_k.STRING, 8, "u", "=") 
 
         else:
             raise NotImplementedError(f"Data type {self._col.dtype} not handled yet")
@@ -396,13 +395,11 @@ def _get_validity_buffer(self) -> Tuple[_CuDFBuffer, Any]:
             return buffer, dtype
 
         elif null == 1:
-            msg = "This column uses NaN as null so does not have a separate mask"
+            raise RuntimeError("This column uses NaN as null so does not have a separate mask")
         elif null == 0:   
-            msg = "This column is non-nullable so does not have a mask"
+            raise RuntimeError("This column is non-nullable so does not have a mask")
         else:
-            raise NotImplementedError("See self.describe_null")
-
-        raise RuntimeError(msg)
+            raise NotImplementedError(f"See {self.__class__.__name__}.describe_null method.")
 
     def _get_offsets_buffer(self) -> Tuple[_CuDFBuffer, Any]:
         """
@@ -422,6 +419,7 @@ def _get_offsets_buffer(self) -> Tuple[_CuDFBuffer, Any]:
 
         return buffer, dtype
 
+
 class _CuDFDataFrame:
     """
     A data frame class, with only the methods required by the interchange
@@ -501,9 +499,6 @@ def __dataframe__(self, nan_as_null : bool = False,
     """
     The public method to attach to cudf.DataFrame.
 
-    We'll attach it via monkey-patching here for demo purposes. If Pandas adopts
-    the protocol, this will be a regular method on pandas.DataFrame.
-
     ``nan_as_null`` is a keyword intended for the consumer to tell the
     producer to overwrite null values in the data with ``NaN`` (or ``NaT``).
     This currently has no effect; once support for nullable extension
@@ -513,7 +508,7 @@ def __dataframe__(self, nan_as_null : bool = False,
     allowed to make a copy of the data. For example, copying data would be
     necessary if a library supports strided buffers, given that this protocol
     specifies contiguous buffers.
-    Currently, if the flag is set to ``False`` and a copy is needed, a
+    Currently, if this flag is set to ``False`` and a copy is needed, a
     ``RuntimeError`` will be raised.
     """
     return _CuDFDataFrame(
@@ -609,13 +604,13 @@ def _protocol_column_to_cudf_column_numeric(col:ColumnObject) -> \
         raise NotImplementedError("column.offset > 0 not handled yet")
 
     _dbuffer, _ddtype = col.get_buffers()['data']
-    _check_data_is_on_gpu(_dbuffer)
+    _check_buffer_is_on_gpu(_dbuffer)
     dcol = build_column(Buffer(_dbuffer.ptr, _dbuffer.bufsize), 
                         protocol_dtype_to_cupy_dtype(_ddtype))        
     return _set_missing_values(col, dcol), _dbuffer
 
 
-def _check_data_is_on_gpu(buffer : _CuDFBuffer) -> None:
+def _check_buffer_is_on_gpu(buffer : _CuDFBuffer) -> None:
     if buffer.__dlpack_device__()[0] != _Device.CUDA and not buffer._allow_copy:
         raise TypeError("This operation must copy data from CPU to GPU. "
                             "Set `allow_copy=True` to allow it.")
@@ -637,8 +632,7 @@ def protocol_dtype_to_cupy_dtype(_dtype) -> cp.dtype:
     kind = _dtype[0]
     bitwidth = _dtype[1]
     _k = _DtypeKind
-    if _dtype[0] not in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL,_k.CATEGORICAL,
-                         _k.STRING, _k.DATETIME):
+    if _dtype[0] not in _SUPPORTED_KINDS:
         raise RuntimeError(f"Data type {_dtype[0]} not handled yet")
    
     return _CP_DTYPES[kind][bitwidth]
@@ -654,7 +648,7 @@ def _protocol_column_to_cudf_column_categorical(col : ColumnObject) -> \
 
     categories = as_column(mapping.values())
     codes_buffer, codes_dtype = col.get_buffers()['data']
-    _check_data_is_on_gpu(codes_buffer)
+    _check_buffer_is_on_gpu(codes_buffer)
     cdtype = protocol_dtype_to_cupy_dtype(codes_dtype)
     codes = build_column(Buffer(codes_buffer.ptr, codes_buffer.bufsize), cdtype)
     
@@ -673,16 +667,17 @@ def _protocol_column_to_cudf_column_string(col : ColumnObject) -> \
     buffers = col.get_buffers()
 
     # Retrieve the data buffer containing the UTF-8 code units
-    dbuffer, bdtype = buffers["data"]
-    _check_data_is_on_gpu(dbuffer)
-    encoded_string = build_column(Buffer(dbuffer.ptr, dbuffer.bufsize),
-                        protocol_dtype_to_cupy_dtype(bdtype)
+    data_buffer, data_dtype = buffers["data"]
+    _check_buffer_is_on_gpu(data_buffer)
+    encoded_string = build_column(Buffer(data_buffer.ptr, data_buffer.bufsize),
+                        protocol_dtype_to_cupy_dtype(data_dtype)
                         )
 
     # Retrieve the offsets buffer containing the index offsets demarcating the beginning and end of each string
-    obuffer, odtype = buffers["offsets"]
-    offsets = build_column(Buffer(obuffer.ptr, obuffer.bufsize), 
-                           protocol_dtype_to_cupy_dtype(odtype)
+    offset_buffer, offset_dtype = buffers["offsets"]
+    _check_buffer_is_on_gpu(offset_buffer)
+    offsets = build_column(Buffer(offset_buffer.ptr, offset_buffer.bufsize), 
+                           protocol_dtype_to_cupy_dtype(offset_dtype)
                            )
     
     col_str = build_column(None, dtype=cp.dtype('O'), children=(offsets, encoded_string))
diff --git a/python/cudf/cudf/tests/test_df_protocol.py b/python/cudf/cudf/tests/test_df_protocol.py
index ac99943b671..d66b0751780 100644
--- a/python/cudf/cudf/tests/test_df_protocol.py
+++ b/python/cudf/cudf/tests/test_df_protocol.py
@@ -39,7 +39,6 @@ def assert_buffer_equal(buffer_and_dtype: Tuple[_CuDFBuffer, Any], cudfcol):
     # check that non null values are the equals as null are represented
     # by sentinel values in the buffer.
     non_null_idxs = cudfcol != None
-    print(non_null_idxs, cudfcol is not None)
     assert_eq(col_from_buf[non_null_idxs], cudfcol[non_null_idxs])
     
     if dtype[0] != _DtypeKind.BOOL:

From 3eefe0c6b6ef0e31517f40ed3e41fb9c587ac538 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Isma=C3=ABl=20Kon=C3=A9?= <i.kone@edu.umi.ac.ma>
Date: Mon, 1 Nov 2021 17:29:27 +0100
Subject: [PATCH 38/60] change bare exception into RuntimeError

Co-authored-by: Bradley Dice <bdice@bradleydice.com>
---
 python/cudf/cudf/core/df_protocol.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py
index b9c3efac707..3de1fb084c2 100644
--- a/python/cudf/cudf/core/df_protocol.py
+++ b/python/cudf/cudf/core/df_protocol.py
@@ -337,7 +337,7 @@ def get_buffers(self) -> Dict[str, _CuDFBuffer]:
         buffers["data"] = self._get_data_buffer()
         try:
             buffers["validity"] = self._get_validity_buffer()
-        except:
+        except RuntimeError:
             buffers["validity"] = None
 
         try:

From a11ddb6ff09b23a286a8a8a702683305670b904b Mon Sep 17 00:00:00 2001
From: iskode <ismael.from.kone@gmail.com>
Date: Tue, 2 Nov 2021 08:18:31 +0000
Subject: [PATCH 39/60] fix flake8 style checks

---
 python/cudf/cudf/core/df_protocol.py | 262 +++++++++++++++------------
 1 file changed, 147 insertions(+), 115 deletions(-)

diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py
index d0b44ea597f..2bad99bea77 100644
--- a/python/cudf/cudf/core/df_protocol.py
+++ b/python/cudf/cudf/core/df_protocol.py
@@ -1,6 +1,5 @@
 import enum
 import collections
-import ctypes
 from typing import Any, Optional, Tuple, Dict, Iterable, Sequence
 
 import cudf
@@ -11,9 +10,6 @@
 from numba import cuda
 
 
-
-
-
 # Implementation of interchange protocol classes
 # ----------------------------------------------
 
@@ -26,6 +22,7 @@ class _DtypeKind(enum.IntEnum):
     DATETIME = 22
     CATEGORICAL = 23
 
+
 class _Device(enum.IntEnum):
     CPU = 1
     CUDA = 2
@@ -36,16 +33,18 @@ class _Device(enum.IntEnum):
     VPI = 9
     ROCM = 10
 
+
 _k = _DtypeKind
 _SUPPORTED_KINDS = (_k.INT, _k.UINT, _k.FLOAT, _k.CATEGORICAL,
-                   _k.BOOL, _k.STRING)
+                    _k.BOOL, _k.STRING)
+
 
 class _CuDFBuffer:
     """
     Data in the buffer is guaranteed to be contiguous in memory.
     """
 
-    def __init__(self, buf : cudf.core.buffer.Buffer, 
+    def __init__(self, buf : cudf.core.buffer.Buffer,
                  dtype: np.dtype, allow_copy : bool = True) -> None:
         """
         Use cudf.core.buffer.Buffer object.
@@ -53,7 +52,7 @@ def __init__(self, buf : cudf.core.buffer.Buffer,
         # Store the cudf buffer where the data resides as a private
         # attribute, so we can use it to retrieve the public attributes
         self._buf = buf
-        self._dtype =  dtype
+        self._dtype = dtype
         self._allow_copy = allow_copy
 
     @property
@@ -69,12 +68,12 @@ def ptr(self) -> int:
         Pointer to start of the buffer as an integer.
         """
         return self._buf.ptr
-        
+
     def __dlpack__(self) :
         """
         DLPack not implemented in NumPy yet, so leave it out here.
         """
-        try: 
+        try:
             cudarray = cuda.as_cuda_array(self._buf).view(self._dtype)
             res = cp.asarray(cudarray).toDlpack()
 
@@ -91,10 +90,14 @@ def __dlpack_device__(self) -> Tuple[_Device, int]:
 
     def __repr__(self) -> str:
         return f'{self.__class__.__name__}(' + str({'bufsize': self.bufsize,
-                                      'ptr': self.ptr,
-                                      'dlpack': self.__dlpack__(),
-                                      'device': self.__dlpack_device__()[0].name}
-                                      ) + ')'
+                                                    'ptr': self.ptr,
+                                                    'dlpack':
+                                                    self.__dlpack__(),
+                                                    'device':
+                                                    self.__dlpack_device__()[0]
+                                                        .name})
+        + ')'
+
 
 class _CuDFColumn:
     """
@@ -112,7 +115,7 @@ class _CuDFColumn:
     """
 
     def __init__(self, column: cudf.core.column.ColumnBase,
-                 nan_as_null : bool = True, 
+                 nan_as_null : bool = True,
                  allow_copy: bool = True) -> None:
         """
         Note: doesn't deal with extension arrays yet, just assume a regular
@@ -144,7 +147,8 @@ def offset(self) -> int:
     @property
     def dtype(self) -> Tuple[_DtypeKind, int, str, str]:
         """
-        Dtype description as a tuple ``(kind, bit-width, format string, endianness)``
+        Dtype description as a tuple
+        ``(kind, bit-width, format string, endianness)``
 
         Kind :
 
@@ -163,27 +167,28 @@ def dtype(self) -> Tuple[_DtypeKind, int, str, str]:
 
         Notes:
 
-            - Kind specifiers are aligned with DLPack where possible (hence the
-              jump to 20, leave enough room for future extension)
-            - Masks must be specified as boolean with either bit width 1 (for bit
-              masks) or 8 (for byte masks).
+            - Kind specifiers are aligned with DLPack where possible
+             (hence the jump to 20, leave enough room for future extension)
+            - Masks must be specified as boolean with either bit width 1
+             (for bit masks) or 8 (for byte masks).
             - Dtype width in bits was preferred over bytes
-            - Endianness isn't too useful, but included now in case in the future
-              we need to support non-native endianness
+            - Endianness isn't too useful, but included now in case
+              in the future we need to support non-native endianness
             - Went with Apache Arrow format strings over NumPy format strings
               because they're more complete from a dataframe perspective
-            - Format strings are mostly useful for datetime specification, and
-              for categoricals.
+            - Format strings are mostly useful for datetime specification,
+              and for categoricals.
             - For categoricals, the format string describes the type of the
-              categorical in the data buffer. In case of a separate encoding of
-              the categorical (e.g. an integer to string mapping), this can
-              be derived from ``self.describe_categorical``.
-            - Data types not included: complex, Arrow-style null, binary, decimal,
-              and nested (list, struct, map, union) dtypes.
+              categorical in the data buffer. In case of a separate encoding
+              of the categorical (e.g. an integer to string mapping),
+              this can be derived from ``self.describe_categorical``.
+            - Data types not included: complex, Arrow-style null,
+              binary, decimal, and nested (list, struct, map, union) dtypes.
         """
         dtype = self._col.dtype
 
-        # For now, assume that, if the column dtype is 'O' (i.e., `object`), then we have an array of strings
+        # For now, assume that, if the column dtype is 'O' (i.e., `object`),
+        # then we have an array of strings
         if not isinstance(dtype, cudf.CategoricalDtype) and dtype.kind == 'O':
             return (_DtypeKind.STRING, 8, 'u', '=')
 
@@ -194,8 +199,9 @@ def _dtype_from_cudfdtype(self, dtype) -> Tuple[_DtypeKind, int, str, str]:
         See `self.dtype` for details.
         """
         # Note: 'c' (complex) not handled yet (not in array spec v1).
-        #       'b', 'B' (bytes), 'S', 'a', (old-style string) 'V' (void) not handled
-        #       datetime and timedelta both map to datetime (is timedelta handled?)
+        #       'b', 'B' (bytes), 'S', 'a', (old-style string) 'V' (void)
+        #       not handled datetime and timedelta both map to datetime
+        #       (is timedelta handled?)
         _k = _DtypeKind
         _np_kinds = {"i": _k.INT, "u": _k.UINT, "f": _k.FLOAT, "b": _k.BOOL,
                      "U": _k.STRING,
@@ -206,7 +212,7 @@ def _dtype_from_cudfdtype(self, dtype) -> Tuple[_DtypeKind, int, str, str]:
             if isinstance(dtype, cudf.CategoricalDtype):
                 kind = _k.CATEGORICAL
                 # Codes and categories' dtypes are different.
-                # We use codes' dtype as these are stored in the buffer. 
+                # We use codes' dtype as these are stored in the buffer.
                 dtype = self._col.codes.dtype
             else:
                 raise ValueError(f"Data type {dtype} not supported by exchange"
@@ -232,23 +238,21 @@ def describe_categorical(self) -> Tuple[bool, bool, Dict[int, Any]]:
 
         Content of returned dict:
 
-            - "is_ordered" : bool, whether the ordering of dictionary indices is
-                             semantically meaningful.
+            - "is_ordered" : bool, whether the ordering of dictionary
+                             indices is semantically meaningful.
             - "is_dictionary" : bool, whether a dictionary-style mapping of
                                 categorical values to other objects exists
             - "mapping" : dict, Python-level only (e.g. ``{int: str}``).
                           None if not a dictionary-style categorical.
         """
         if not self.dtype[0] == _DtypeKind.CATEGORICAL:
-            raise TypeError("`describe_categorical only works on a column with "
-                            "categorical dtype!")
+            raise TypeError("`describe_categorical only works on "
+                            "a column with categorical dtype!")
 
         ordered = self._col.dtype.ordered
         is_dictionary = True
         # NOTE: this shows the children approach is better, transforming
         # `categories` to a "mapping" dict is inefficient
-        codes = self._col.codes  # ndarray, length `self.size`
-        # categories.values is ndarray of length n_categories
         categories = self._col.categories
         mapping = {ix: val for ix, val in enumerate(categories.values_host)}
         return ordered, is_dictionary, mapping
@@ -267,11 +271,11 @@ def describe_null(self) -> Tuple[int, Any]:
             - 3 : bit mask
             - 4 : byte mask
 
-        Value : if kind is "sentinel value", the actual value.  If kind is a bit
-        mask or a byte mask, the value (0 or 1) indicating a missing value. None
-        otherwise.
+        Value : if kind is "sentinel value", the actual value.
+        If kind is a bit mask or a byte mask, the value (0 or 1)
+        indicating a missing value.
+        None otherwise.
         """
-        _k = _DtypeKind
         kind = self.dtype[0]
         if self.null_count == 0:
             # there is no validity mask so it is non-nullable
@@ -280,9 +284,10 @@ def describe_null(self) -> Tuple[int, Any]:
         elif kind in _SUPPORTED_KINDS:
             # bit mask is universally used in cudf for missing
             return 3, 0
-            
+
         else:
-            raise NotImplementedError(f"Data type {self.dtype} not yet supported")
+            raise NotImplementedError(f"Data type {self.dtype}"
+                                      " not yet supported")
 
     @property
     def null_count(self) -> int:
@@ -304,7 +309,8 @@ def num_chunks(self) -> int:
         """
         return 1
 
-    def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable['_CuDFColumn']:
+    def get_chunks(self, n_chunks : Optional[int] = None) ->\
+            Iterable['_CuDFColumn']:
         """
         Return an iterable yielding the chunks.
 
@@ -337,74 +343,84 @@ def get_buffers(self) -> Dict[str, _CuDFBuffer]:
         buffers["data"] = self._get_data_buffer()
         try:
             buffers["validity"] = self._get_validity_buffer()
-        except:
+        except RuntimeError:
             buffers["validity"] = None
 
         try:
             buffers["offsets"] = self._get_offsets_buffer()
-        except:
+        except RuntimeError:
             buffers["offsets"] = None
 
         return buffers
 
-    def _get_data_buffer(self) -> Tuple[_CuDFBuffer, Any]:  # Any is for self.dtype tuple
+    def _get_data_buffer(self) -> Tuple[_CuDFBuffer,
+                                        Tuple[_DtypeKind, int, str, str]]:
         """
-        Return the buffer containing the data and the buffer's associated dtype.
+        Return the buffer containing the data and
+               the buffer's associated dtype.
         """
         _k = _DtypeKind
-        invalid = self.describe_null[1]
         if self.dtype[0] in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL):
-            buffer = _CuDFBuffer(self._col.data, self._col.dtype, 
+            buffer = _CuDFBuffer(self._col.data, self._col.dtype,
                                  allow_copy=self._allow_copy)
             dtype = self.dtype
 
         elif self.dtype[0] == _k.CATEGORICAL:
             codes = self._col.codes
-            buffer = _CuDFBuffer(self._col.codes.data, self._col.codes.dtype, 
+            buffer = _CuDFBuffer(self._col.codes.data, self._col.codes.dtype,
                                  allow_copy=self._allow_copy)
             dtype = self._dtype_from_cudfdtype(codes.dtype)
 
         elif self.dtype[0] == _k.STRING:
             encoded_string = self._col.children[1]
-            buffer = _CuDFBuffer(encoded_string.data, encoded_string.dtype, allow_copy=self._allow_copy)
-            dtype = self._dtype_from_cudfdtype(encoded_string.dtype) 
+            buffer = _CuDFBuffer(encoded_string.data, encoded_string.dtype,
+                                 allow_copy=self._allow_copy)
+            dtype = self._dtype_from_cudfdtype(encoded_string.dtype)
 
         else:
-            raise NotImplementedError(f"Data type {self._col.dtype} not handled yet")
+            raise NotImplementedError(f"Data type {self._col.dtype}"
+                                      " not handled yet")
 
         return buffer, dtype
 
     def _get_validity_buffer(self) -> Tuple[_CuDFBuffer, Any]:
         """
-        Return the buffer containing the mask values indicating missing data and
-        the buffer's associated dtype.
+        Return the buffer containing the mask values
+        indicating missing data and the buffer's associated dtype.
 
         Raises RuntimeError if null representation is not a bit or byte mask.
         """
-        
+
         null, invalid = self.describe_null
         if null == 3:
             _k = _DtypeKind
             if self.dtype[0] == _k.CATEGORICAL:
-                buffer = _CuDFBuffer(self._col.codes._get_mask_as_column().data, cp.uint8, 
-                                     allow_copy=self._allow_copy)
+                buffer = _CuDFBuffer(self._col.codes.
+                                     _get_mask_as_column().data,
+                                     cp.uint8, allow_copy=self._allow_copy)
             else:
-                buffer = _CuDFBuffer(self._col._get_mask_as_column().data, cp.uint8,
+                buffer = _CuDFBuffer(self._col.
+                                     _get_mask_as_column().data,
+                                     cp.uint8,
                                      allow_copy=self._allow_copy)
             dtype = (_k.UINT, 8, "C", "=")
             return buffer, dtype
 
         elif null == 1:
-            raise RuntimeError("This column uses NaN as null so does not have a separate mask")
-        elif null == 0:   
-            raise RuntimeError("This column is non-nullable so does not have a mask")
+            raise RuntimeError("This column uses NaN as null "
+                               "so does not have a separate mask")
+        elif null == 0:
+            raise RuntimeError("This column is non-nullable"
+                               " so does not have a mask")
         else:
-            raise NotImplementedError(f"See {self.__class__.__name__}.describe_null method.")
+            raise NotImplementedError(f"See {self.__class__.__name__}"
+                                      ".describe_null method.")
 
     def _get_offsets_buffer(self) -> Tuple[_CuDFBuffer, Any]:
         """
-        Return the buffer containing the offset values for variable-size binary
-        data (e.g., variable-length strings) and the buffer's associated dtype.
+        Return the buffer containing the offset values for
+        variable-size binary data (e.g., variable-length strings)
+        and the buffer's associated dtype.
 
         Raises RuntimeError if the data buffer does not have an associated
         offsets buffer.
@@ -412,10 +428,12 @@ def _get_offsets_buffer(self) -> Tuple[_CuDFBuffer, Any]:
         _k = _DtypeKind
         if self.dtype[0] == _k.STRING:
             offsets = self._col.children[0]
-            buffer = _CuDFBuffer(offsets.data, offsets.dtype, allow_copy=self._allow_copy)
-            dtype = self._dtype_from_cudfdtype(offsets.dtype) 
+            buffer = _CuDFBuffer(offsets.data, offsets.dtype,
+                                 allow_copy=self._allow_copy)
+            dtype = self._dtype_from_cudfdtype(offsets.dtype)
         else:
-            raise RuntimeError("This column has a fixed-length dtype so does not have an offsets buffer")
+            raise RuntimeError("This column has a fixed-length dtype "
+                               "so does not have an offsets buffer")
 
         return buffer, dtype
 
@@ -438,7 +456,8 @@ def __init__(self, df : 'cudf.core.dataframe.DataFrame',
         """
         self._df = df
         # ``nan_as_null`` is a keyword intended for the consumer to tell the
-        # producer to overwrite null values in the data with ``NaN`` (or ``NaT``).
+        # producer to overwrite null values in the data with
+        # ``NaN`` (or ``NaT``).
         # This currently has no effect; once support for nullable extension
         # dtypes is added, this value should be propagated to columns.
         self._nan_as_null = nan_as_null
@@ -447,7 +466,7 @@ def __init__(self, df : 'cudf.core.dataframe.DataFrame',
     @property
     def metadata(self):
         # `index` isn't a regular column, and the protocol doesn't support row
-        # labels - so we export it as Pandas-specific metadata here.
+        # labels - so we export it as cuDF-specific metadata here.
         return {"cudf.index": self._df.index}
 
     def num_columns(self) -> int:
@@ -480,14 +499,16 @@ def select_columns(self, indices: Sequence[int]) -> '_CuDFDataFrame':
 
         return _CuDFDataFrame(self._df.iloc[:, indices])
 
-    def select_columns_by_name(self, names: Sequence[str]) -> '_CuDFDataFrame':
+    def select_columns_by_name(self, names: Sequence[str]) ->\
+            '_CuDFDataFrame':
         if not isinstance(names, collections.Sequence):
             raise ValueError("`names` is not a sequence")
 
         return _CuDFDataFrame(self._df.loc[:, names], self._nan_as_null,
-                                self._allow_copy)
+                              self._allow_copy)
 
-    def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable['_CuDFDataFrame']:
+    def get_chunks(self, n_chunks : Optional[int] = None) -> \
+            Iterable['_CuDFDataFrame']:
         """
         Return an iterator yielding the chunks.
         """
@@ -528,8 +549,8 @@ def __dataframe__(self, nan_as_null : bool = False,
 -----
 
 - Interpreting a raw pointer (as in ``Buffer.ptr``) is annoying and unsafe to
-  do in pure Python. It's more general but definitely less friendly than having
-  ``to_arrow`` and ``to_numpy`` methods. So for the buffers which lack
+  do in pure Python. It's more general but definitely less friendly than
+  having ``to_arrow`` and ``to_numpy`` methods. So for the buffers which lack
   ``__dlpack__`` (e.g., because the column dtype isn't supported by DLPack),
   this is worth looking at again.
 
@@ -548,7 +569,8 @@ def __dataframe__(self, nan_as_null : bool = False,
 _CP_DTYPES = {0: _INTS, 1: _UINTS, 2: _FLOATS, 20: {8: bool}}
 
 
-def from_dataframe(df : DataFrameObject, allow_copy: bool = False) -> _CuDFDataFrame :
+def from_dataframe(df : DataFrameObject, allow_copy: bool = False) ->\
+        _CuDFDataFrame :
     """
     Construct a cudf DataFrame from ``df`` if it supports ``__dataframe__``
     """
@@ -577,17 +599,18 @@ def _from_dataframe(df : DataFrameObject) -> _CuDFDataFrame :
         col = df.get_column_by_name(name)
 
         if col.dtype[0] in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL):
-            columns[name], _buf = _protocol_column_to_cudf_column_numeric(col)
+            columns[name], _buf = _protocol_to_cudf_column_numeric(col)
 
         elif col.dtype[0] == _k.CATEGORICAL:
-            columns[name], _buf = _protocol_column_to_cudf_column_categorical(col)
+            columns[name], _buf = _protocol_to_cudf_column_categorical(col)
 
         elif col.dtype[0] == _k.STRING:
-            columns[name], _buf = _protocol_column_to_cudf_column_string(col)
-            
+            columns[name], _buf = _protocol_to_cudf_column_string(col)
+
         else:
-            raise NotImplementedError(f"Data type {col.dtype[0]} not handled yet")
-        
+            raise NotImplementedError(f"Data type {col.dtype[0]}"
+                                      " not handled yet")
+
         _buffers.append(_buf)
 
     df_new = cudf.DataFrame._from_data(columns)
@@ -595,71 +618,80 @@ def _from_dataframe(df : DataFrameObject) -> _CuDFDataFrame :
     return df_new
 
 
-def _protocol_column_to_cudf_column_numeric(col:ColumnObject) -> \
-                    Tuple[cudf.core.column.NumericalColumn, _CuDFBuffer]:
+def _protocol_to_cudf_column_numeric(col: ColumnObject) -> \
+        Tuple[cudf.core.column.NumericalColumn, _CuDFBuffer]:
     """
-    Convert an int, uint, float or bool protocol column to the corresponding cudf column
+    Convert an int, uint, float or bool protocol column
+    to the corresponding cudf column
     """
     if col.offset != 0:
         raise NotImplementedError("column.offset > 0 not handled yet")
 
     _dbuffer, _ddtype = col.get_buffers()['data']
     _check_buffer_is_on_gpu(_dbuffer)
-    dcol = build_column(Buffer(_dbuffer.ptr, _dbuffer.bufsize), 
-                        protocol_dtype_to_cupy_dtype(_ddtype))        
+    dcol = build_column(Buffer(_dbuffer.ptr, _dbuffer.bufsize),
+                        protocol_dtype_to_cupy_dtype(_ddtype))
     return _set_missing_values(col, dcol), _dbuffer
 
 
 def _check_buffer_is_on_gpu(buffer : _CuDFBuffer) -> None:
-    if buffer.__dlpack_device__()[0] != _Device.CUDA and not buffer._allow_copy:
+    if buffer.__dlpack_device__()[0] != _Device.CUDA and \
+            not buffer._allow_copy:
         raise TypeError("This operation must copy data from CPU to GPU. "
-                            "Set `allow_copy=True` to allow it.")
+                        "Set `allow_copy=True` to allow it.")
+
 
-def _set_missing_values(protocol_col: _CuDFColumn, 
-                        cudf_col:'cudf.core.dataframe.DataFrame') \
-                        -> cudf.core.column.ColumnBase:
+def _set_missing_values(protocol_col: _CuDFColumn,
+                        cudf_col: 'cudf.core.dataframe.DataFrame') -> \
+        cudf.core.column.ColumnBase:
 
     null_kind, null_value = protocol_col.describe_null
     if null_kind != 0:
-        assert null_kind == 3, f"cudf supports only bit mask, null_kind should be 3, got: {null_kind}." 
+        assert null_kind == 3, "cudf supports only bit mask, "
+        f"null_kind should be 3, got: {null_kind}."
         _mask_buffer, _mask_dtype = protocol_col.get_buffers()["validity"]
-        bitmask = cp.asarray(Buffer(_mask_buffer.ptr, _mask_buffer.bufsize), cp.bool8) 
+        bitmask = cp.asarray(Buffer(_mask_buffer.ptr, _mask_buffer.bufsize),
+                             cp.bool8)
         cudf_col[~bitmask] = None
 
     return cudf_col
 
+
 def protocol_dtype_to_cupy_dtype(_dtype) -> cp.dtype:
     kind = _dtype[0]
     bitwidth = _dtype[1]
-    _k = _DtypeKind
     if _dtype[0] not in _SUPPORTED_KINDS:
         raise RuntimeError(f"Data type {_dtype[0]} not handled yet")
-   
+
     return _CP_DTYPES[kind][bitwidth]
 
-def _protocol_column_to_cudf_column_categorical(col : ColumnObject) -> \
-    Tuple[cudf.core.column.CategoricalColumn, _CuDFBuffer] :
+
+def _protocol_to_cudf_column_categorical(col : ColumnObject) -> \
+        Tuple[cudf.core.column.CategoricalColumn, _CuDFBuffer] :
     """
     Convert a categorical column to a Series instance
     """
     ordered, is_dict, mapping = col.describe_categorical
     if not is_dict:
-        raise NotImplementedError('Non-dictionary categoricals not supported yet')
+        raise NotImplementedError("Non-dictionary categoricals"
+                                  " not supported yet")
 
     categories = as_column(mapping.values())
     codes_buffer, codes_dtype = col.get_buffers()['data']
     _check_buffer_is_on_gpu(codes_buffer)
     cdtype = protocol_dtype_to_cupy_dtype(codes_dtype)
-    codes = build_column(Buffer(codes_buffer.ptr, codes_buffer.bufsize), cdtype)
-    
-    cudfcol = build_categorical_column(categories=categories,codes=codes,mask=codes.base_mask,
-                                    size=codes.size,ordered=ordered)
+    codes = build_column(Buffer(codes_buffer.ptr, codes_buffer.bufsize),
+                         cdtype)
+
+    cudfcol = build_categorical_column(categories=categories, codes=codes,
+                                       mask=codes.base_mask, size=codes.size,
+                                       ordered=ordered)
 
     return _set_missing_values(col, cudfcol), codes_buffer
 
 
-def _protocol_column_to_cudf_column_string(col : ColumnObject) -> \
-    Tuple[cudf.core.column.StringColumn, Tuple[_CuDFBuffer]] :
+def _protocol_to_cudf_column_string(col : ColumnObject) -> \
+        Tuple[cudf.core.column.StringColumn, Tuple[_CuDFBuffer]] :
     """
     Convert a string ColumnObject to cudf Column object.
     """
@@ -670,16 +702,16 @@ def _protocol_column_to_cudf_column_string(col : ColumnObject) -> \
     data_buffer, data_dtype = buffers["data"]
     _check_buffer_is_on_gpu(data_buffer)
     encoded_string = build_column(Buffer(data_buffer.ptr, data_buffer.bufsize),
-                        protocol_dtype_to_cupy_dtype(data_dtype)
-                        )
+                                  protocol_dtype_to_cupy_dtype(data_dtype))
 
-    # Retrieve the offsets buffer containing the index offsets demarcating the beginning and end of each string
+    # Retrieve the offsets buffer containing the index offsets demarcating
+    # the beginning and end of each string
     offset_buffer, offset_dtype = buffers["offsets"]
     _check_buffer_is_on_gpu(offset_buffer)
-    offsets = build_column(Buffer(offset_buffer.ptr, offset_buffer.bufsize), 
-                           protocol_dtype_to_cupy_dtype(offset_dtype)
-                           )
-    
-    col_str = build_column(None, dtype=cp.dtype('O'), children=(offsets, encoded_string))
+    offsets = build_column(Buffer(offset_buffer.ptr, offset_buffer.bufsize),
+                           protocol_dtype_to_cupy_dtype(offset_dtype))
+
+    col_str = build_column(None, dtype=cp.dtype('O'),
+                           children=(offsets, encoded_string))
 
     return _set_missing_values(col, col_str), buffers

From 117e4321bb2b51c8c543ba5241234125997967ac Mon Sep 17 00:00:00 2001
From: iskode <ismael.from.kone@gmail.com>
Date: Tue, 2 Nov 2021 08:57:06 +0000
Subject: [PATCH 40/60] fix flake8 style checks for 'test_df_protocol.py' file

---

Notes:
    Review notes. Text between the `---` line and the diffstat is not part
    of the commit and is ignored when the patch is applied.

    * `_test_from_dataframe_exception`: the long `exception_msg` literal is
      wrapped onto two lines without parentheses or a backslash. The second
      line (" Set `allow_copy=True` to allow it.") is therefore a separate
      expression statement with no effect, and
      `pytest.raises(TypeError, match=exception_msg)` now only matches the
      first sentence of the error. Enclosing both literals in parentheses
      restores the full pattern.

    * The df_protocol.py hunk that immediately precedes this patch in the
      series wraps the `assert null_kind == 3, ...` message in
      `_set_missing_values` the same way: the f-string that reports the
      actual `null_kind` ends up as a stand-alone statement and is no longer
      part of the assertion message.

 python/cudf/cudf/tests/test_df_protocol.py | 88 ++++++++++------------
 1 file changed, 41 insertions(+), 47 deletions(-)

diff --git a/python/cudf/cudf/tests/test_df_protocol.py b/python/cudf/cudf/tests/test_df_protocol.py
index d66b0751780..a80b94dc419 100644
--- a/python/cudf/cudf/tests/test_df_protocol.py
+++ b/python/cudf/cudf/tests/test_df_protocol.py
@@ -1,12 +1,9 @@
-import datetime
 import cupy as cp
-import numpy as np
 import pytest
 from cudf.core.df_protocol import (
-    _from_dataframe, 
+    _from_dataframe,
     _DtypeKind,
     protocol_dtype_to_cupy_dtype,
-    _CuDFDataFrame,
     _CuDFColumn,
     _CuDFBuffer
 )
@@ -14,33 +11,24 @@
 import cudf
 from cudf.core.column import build_column
 from cudf.core.buffer import Buffer
-from cudf.testing import _utils as utils
-from cudf.testing._utils import (
-    ALL_TYPES,
-    DATETIME_TYPES,
-    NUMERIC_TYPES,
-    assert_eq,
-    assert_exceptions_equal,
-    does_not_raise,
-    gen_rand,
-)
+from cudf.testing._utils import assert_eq
 import pandas as pd
 from typing import Any, Tuple
 
 DataFrameObject = Any
 
+
 def assert_buffer_equal(buffer_and_dtype: Tuple[_CuDFBuffer, Any], cudfcol):
     buf, dtype = buffer_and_dtype
     device_id = cp.asarray(cudfcol.data).device.id
     assert buf.__dlpack_device__() == (2, device_id)
     col_from_buf = build_column(Buffer(buf.ptr, buf.bufsize),
-                        protocol_dtype_to_cupy_dtype(dtype)
-                        )
+                                protocol_dtype_to_cupy_dtype(dtype))
     # check that non null values are the equals as null are represented
     # by sentinel values in the buffer.
-    non_null_idxs = cudfcol != None
+    non_null_idxs = cudfcol != cudf.NA
     assert_eq(col_from_buf[non_null_idxs], cudfcol[non_null_idxs])
-    
+
     if dtype[0] != _DtypeKind.BOOL:
         array_from_dlpack = cp.fromDlpack(buf.__dlpack__())
         col_array = cp.asarray(cudfcol.data_array_view)
@@ -49,42 +37,37 @@ def assert_buffer_equal(buffer_and_dtype: Tuple[_CuDFBuffer, Any], cudfcol):
         pytest.raises(TypeError, buf.__dlpack__)
 
 
-
-
 def assert_column_equal(col: _CuDFColumn, cudfcol):
-    assert col.size == cudfcol.size 
+    assert col.size == cudfcol.size
     assert col.offset == 0
     assert col.null_count == cudfcol.null_count
     assert col.num_chunks() == 1
     if col.null_count == 0 :
         pytest.raises(RuntimeError, col._get_validity_buffer)
-        assert col.get_buffers()['validity'] == None
+        assert col.get_buffers()['validity'] is None
     else:
         assert_buffer_equal(col.get_buffers()['validity'],
                             cudfcol._get_mask_as_column().astype(cp.uint8))
-    
+
     if col.dtype[0] == _DtypeKind.CATEGORICAL:
         assert_buffer_equal(col.get_buffers()['data'], cudfcol.codes)
-        assert col.get_buffers()['offsets'] == None
+        assert col.get_buffers()['offsets'] is None
 
     elif col.dtype[0] == _DtypeKind.STRING:
         assert_buffer_equal(col.get_buffers()['data'], cudfcol.children[1])
         assert_buffer_equal(col.get_buffers()['offsets'], cudfcol.children[0])
-        
+
     else:
         assert_buffer_equal(col.get_buffers()['data'], cudfcol)
-        assert col.get_buffers()['offsets'] == None
+        assert col.get_buffers()['offsets'] is None
 
-    null_kind, null_value = col.describe_null
     if col.null_count == 0:
-        assert null_kind == 0
-        assert null_value == None
+        assert col.describe_null == (0, None)
     else:
-        assert null_kind == 3
-        assert null_value == 0
+        assert col.describe_null == (3, 0)
 
 
-def assert_dataframe_equal(dfo: DataFrameObject, df:cudf.DataFrame):
+def assert_dataframe_equal(dfo: DataFrameObject, df: cudf.DataFrame):
     assert dfo.num_columns() == len(df.columns)
     assert dfo.num_rows() == len(df)
     assert dfo.num_chunks() == 1
@@ -108,29 +91,35 @@ def _test_from_dataframe_equals(dfobj):
 
 
 def _test_from_dataframe_exception(dfobj):
-    exception_msg = "This operation must copy data from CPU to GPU. Set `allow_copy=True` to allow it."
+    exception_msg = "This operation must copy data from CPU to GPU."
+    " Set `allow_copy=True` to allow it."
     with pytest.raises(TypeError, match=exception_msg):
-        df2 = _from_dataframe(dfobj)
+        _from_dataframe(dfobj)
+
 
 def _test_datatype(data):
     cdf = cudf.DataFrame(data=data)
     _test_from_dataframe_equals(cdf.__dataframe__(allow_copy=False))
     _test_from_dataframe_equals(cdf.__dataframe__(allow_copy=True))
 
+
 def test_from_dataframe():
     data = dict(a=[1, 2, 3], b=[9, 10, 11])
     df1 = cudf.DataFrame(data=data)
     df2 = cudf.from_dataframe(df1)
     assert_eq(df1, df2)
-    
+
+
 def test_int_dtype():
     data_int = dict(a=[1, 2, 3], b=[9, 10, 11])
     _test_datatype(data_int)
 
+
 def test_float_dtype():
     data_float = dict(a=[1.5, 2.5, 3.5], b=[9.2, 10.5, 11.8])
     _test_datatype(data_float)
 
+
 def test_categorical_dtype():
     cdf = cudf.DataFrame({"A": [1, 2, 5, 1]})
     cdf["A"] = cdf["A"].astype("category")
@@ -140,6 +129,7 @@ def test_categorical_dtype():
     _test_from_dataframe_equals(cdf.__dataframe__(allow_copy=False))
     _test_from_dataframe_equals(cdf.__dataframe__(allow_copy=True))
 
+
 def test_bool_dtype():
     data_bool = dict(a=[True, True, False], b=[False, True, False])
     _test_datatype(data_bool)
@@ -152,23 +142,25 @@ def test_string_dtype():
 
 def test_mixed_dtype():
     data_mixed = dict(int=[1, 2, 3], float=[1.5, 2.5, 3.5],
-                        bool=[True, False, True], categorical=[5, 1, 5],
-                        string=["rapidsai-cudf ", "", "df protocol"])
+                      bool=[True, False, True], categorical=[5, 1, 5],
+                      string=["rapidsai-cudf ", "", "df protocol"])
     _test_datatype(data_mixed)
 
 
 def test_NA_int_dtype():
-    data_int = dict(a=[1, None, 3, None, 5], 
+    data_int = dict(a=[1, None, 3, None, 5],
                     b=[9, 10, None, 7, 8],
-                    c= [6, 19, 20, 100, 1000] )
+                    c=[6, 19, 20, 100, 1000])
     _test_datatype(data_int)
 
+
 def test_NA_float_dtype():
-    data_float = dict(a=[1.4, None, 3.6, None, 5.2], 
-                    b=[9.7, 10.9, None, 7.8, 8.2],
-                    c= [6.1, 19.2, 20.3, 100.4, 1000.5] )
+    data_float = dict(a=[1.4, None, 3.6, None, 5.2],
+                      b=[9.7, 10.9, None, 7.8, 8.2],
+                      c=[6.1, 19.2, 20.3, 100.4, 1000.5])
     _test_datatype(data_float)
 
+
 def test_NA_categorical_dtype():
     df = cudf.DataFrame({"A": [1, 2, 5, 1]})
     df["B"] = df["A"].astype("category")
@@ -184,10 +176,12 @@ def test_NA_categorical_dtype():
     _test_from_dataframe_equals(df.__dataframe__(allow_copy=False))
     _test_from_dataframe_equals(df.__dataframe__(allow_copy=True))
 
+
 def test_NA_bool_dtype():
     data_bool = dict(a=[None, True, False], b=[False, None, None])
     _test_datatype(data_bool)
 
+
 def test_NA_string_dtype():
     df = cudf.DataFrame({"A": ["a", "b", "cdef", "", "g"]})
     df["B"] = df["A"].astype("object")
@@ -204,9 +198,9 @@ def test_NA_string_dtype():
 
 
 def test_NA_mixed_dtype():
-    data_mixed = dict(int=[1, None, 2, 3, 1000], float=[None, 1.5, 2.5, 3.5, None],
-                        bool=[True, None, False, None, None], 
-                        categorical=[5, 1, 5, 3, None],
-                        string=[None, None, None, "df protocol", None])
+    data_mixed = dict(int=[1, None, 2, 3, 1000],
+                      float=[None, 1.5, 2.5, 3.5, None],
+                      bool=[True, None, False, None, None],
+                      categorical=[5, 1, 5, 3, None],
+                      string=[None, None, None, "df protocol", None])
     _test_datatype(data_mixed)
-

From 5eb76420cd8084daf42bd40fdb59528185ded68a Mon Sep 17 00:00:00 2001
From: iskode <ismael.from.kone@gmail.com>
Date: Tue, 2 Nov 2021 08:58:36 +0000
Subject: [PATCH 41/60] isort formatting

---

Notes:
    Review notes. Text between the `---` line and the diffstat is not part
    of the commit and is ignored when the patch is applied.

    * Both hunks only reorder and regroup import statements (standard
      library, then third-party, then cudf). No imported name is added or
      removed in either file.

 python/cudf/cudf/core/df_protocol.py       | 12 ++++++------
 python/cudf/cudf/tests/test_df_protocol.py | 19 ++++++++++---------
 2 files changed, 16 insertions(+), 15 deletions(-)

diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py
index 2bad99bea77..6285c283c02 100644
--- a/python/cudf/cudf/core/df_protocol.py
+++ b/python/cudf/cudf/core/df_protocol.py
@@ -1,14 +1,14 @@
-import enum
 import collections
-from typing import Any, Optional, Tuple, Dict, Iterable, Sequence
+import enum
+from typing import Any, Dict, Iterable, Optional, Sequence, Tuple
 
-import cudf
-from cudf.core.column import as_column, build_column, build_categorical_column
-from cudf.core.buffer import Buffer
-import numpy as np
 import cupy as cp
+import numpy as np
 from numba import cuda
 
+import cudf
+from cudf.core.buffer import Buffer
+from cudf.core.column import as_column, build_categorical_column, build_column
 
 # Implementation of interchange protocol classes
 # ----------------------------------------------
diff --git a/python/cudf/cudf/tests/test_df_protocol.py b/python/cudf/cudf/tests/test_df_protocol.py
index a80b94dc419..45fb30ce998 100644
--- a/python/cudf/cudf/tests/test_df_protocol.py
+++ b/python/cudf/cudf/tests/test_df_protocol.py
@@ -1,19 +1,20 @@
+from typing import Any, Tuple
+
 import cupy as cp
+import pandas as pd
 import pytest
+
+import cudf
+from cudf.core.buffer import Buffer
+from cudf.core.column import build_column
 from cudf.core.df_protocol import (
-    _from_dataframe,
+    _CuDFBuffer,
+    _CuDFColumn,
     _DtypeKind,
+    _from_dataframe,
     protocol_dtype_to_cupy_dtype,
-    _CuDFColumn,
-    _CuDFBuffer
 )
-
-import cudf
-from cudf.core.column import build_column
-from cudf.core.buffer import Buffer
 from cudf.testing._utils import assert_eq
-import pandas as pd
-from typing import Any, Tuple
 
 DataFrameObject = Any
 

From e164540aa2ecddd1a400d6d9e9159e54f9f303dc Mon Sep 17 00:00:00 2001
From: iskode <ismael.from.kone@gmail.com>
Date: Tue, 2 Nov 2021 09:09:32 +0000
Subject: [PATCH 42/60] run 'black' to format code.

---

Notes:
    Review notes. Text between the `---` line and the diffstat is not part
    of the commit and is ignored when the patch is applied. The reformat
    itself is mechanical; the items below were already present in the code
    being reformatted and are only easier to see now.

    * `_CuDFBuffer.__repr__`: the closing `+")"` sits on its own line after
      the `return` statement. It is an unreachable expression statement, so
      the returned text has an opening "(" but no closing ")". The
      concatenation needs to be part of the `return` expression.

    * `_CuDFColumn._dtype_from_cudfdtype`: the two adjacent literals
      "... not supported by exchange" and "protocol" are joined without a
      space, so the ValueError message reads "exchangeprotocol".

    * `_CuDFDataFrame.select_columns_by_name` tests against
      `collections.Sequence`, while `select_columns` uses
      `collections.abc.Sequence`. The un-namespaced alias is no longer
      available from Python 3.10 on; both methods should use
      `collections.abc.Sequence`.

    * `_CuDFDataFrame.select_columns` builds the result without passing
      `self._nan_as_null` / `self._allow_copy`, whereas
      `select_columns_by_name` forwards both. The two selectors should agree.

    * Several messages are now written as two adjacent literals on a single
      line (for example "Columns of type {} not handled " "yet"). They still
      concatenate correctly and could simply be merged into one literal.

 python/cudf/cudf/core/df_protocol.py       | 318 +++++++++++++--------
 python/cudf/cudf/tests/test_df_protocol.py |  71 +++--
 2 files changed, 238 insertions(+), 151 deletions(-)

diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py
index 6285c283c02..debb0d29079 100644
--- a/python/cudf/cudf/core/df_protocol.py
+++ b/python/cudf/cudf/core/df_protocol.py
@@ -13,12 +13,13 @@
 # Implementation of interchange protocol classes
 # ----------------------------------------------
 
+
 class _DtypeKind(enum.IntEnum):
     INT = 0
     UINT = 1
     FLOAT = 2
     BOOL = 20
-    STRING = 21   # UTF-8
+    STRING = 21  # UTF-8
     DATETIME = 22
     CATEGORICAL = 23
 
@@ -35,8 +36,14 @@ class _Device(enum.IntEnum):
 
 
 _k = _DtypeKind
-_SUPPORTED_KINDS = (_k.INT, _k.UINT, _k.FLOAT, _k.CATEGORICAL,
-                    _k.BOOL, _k.STRING)
+_SUPPORTED_KINDS = (
+    _k.INT,
+    _k.UINT,
+    _k.FLOAT,
+    _k.CATEGORICAL,
+    _k.BOOL,
+    _k.STRING,
+)
 
 
 class _CuDFBuffer:
@@ -44,8 +51,12 @@ class _CuDFBuffer:
     Data in the buffer is guaranteed to be contiguous in memory.
     """
 
-    def __init__(self, buf : cudf.core.buffer.Buffer,
-                 dtype: np.dtype, allow_copy : bool = True) -> None:
+    def __init__(
+        self,
+        buf: cudf.core.buffer.Buffer,
+        dtype: np.dtype,
+        allow_copy: bool = True,
+    ) -> None:
         """
         Use cudf.core.buffer.Buffer object.
         """
@@ -69,7 +80,7 @@ def ptr(self) -> int:
         """
         return self._buf.ptr
 
-    def __dlpack__(self) :
+    def __dlpack__(self):
         """
         DLPack not implemented in NumPy yet, so leave it out here.
         """
@@ -78,7 +89,7 @@ def __dlpack__(self) :
             res = cp.asarray(cudarray).toDlpack()
 
         except ValueError:
-            raise TypeError(f'dtype {self._dtype} unsupported by `dlpack`')
+            raise TypeError(f"dtype {self._dtype} unsupported by `dlpack`")
 
         return res
 
@@ -89,14 +100,15 @@ def __dlpack_device__(self) -> Tuple[_Device, int]:
         return (_Device.CUDA, cp.asarray(self._buf).device.id)
 
     def __repr__(self) -> str:
-        return f'{self.__class__.__name__}(' + str({'bufsize': self.bufsize,
-                                                    'ptr': self.ptr,
-                                                    'dlpack':
-                                                    self.__dlpack__(),
-                                                    'device':
-                                                    self.__dlpack_device__()[0]
-                                                        .name})
-        + ')'
+        return f"{self.__class__.__name__}(" + str(
+            {
+                "bufsize": self.bufsize,
+                "ptr": self.ptr,
+                "dlpack": self.__dlpack__(),
+                "device": self.__dlpack_device__()[0].name,
+            }
+        )
+        +")"
 
 
 class _CuDFColumn:
@@ -114,16 +126,20 @@ class _CuDFColumn:
 
     """
 
-    def __init__(self, column: cudf.core.column.ColumnBase,
-                 nan_as_null : bool = True,
-                 allow_copy: bool = True) -> None:
+    def __init__(
+        self,
+        column: cudf.core.column.ColumnBase,
+        nan_as_null: bool = True,
+        allow_copy: bool = True,
+    ) -> None:
         """
         Note: doesn't deal with extension arrays yet, just assume a regular
         Series/ndarray for now.
         """
         if not isinstance(column, cudf.Series):
-            raise NotImplementedError("Columns of type {} not handled "
-                                      "yet".format(type(column)))
+            raise NotImplementedError(
+                "Columns of type {} not handled " "yet".format(type(column))
+            )
 
         # Store the column as a private attribute
         self._col = as_column(column)
@@ -189,8 +205,8 @@ def dtype(self) -> Tuple[_DtypeKind, int, str, str]:
 
         # For now, assume that, if the column dtype is 'O' (i.e., `object`),
         # then we have an array of strings
-        if not isinstance(dtype, cudf.CategoricalDtype) and dtype.kind == 'O':
-            return (_DtypeKind.STRING, 8, 'u', '=')
+        if not isinstance(dtype, cudf.CategoricalDtype) and dtype.kind == "O":
+            return (_DtypeKind.STRING, 8, "u", "=")
 
         return self._dtype_from_cudfdtype(dtype)
 
@@ -203,9 +219,15 @@ def _dtype_from_cudfdtype(self, dtype) -> Tuple[_DtypeKind, int, str, str]:
         #       not handled datetime and timedelta both map to datetime
         #       (is timedelta handled?)
         _k = _DtypeKind
-        _np_kinds = {"i": _k.INT, "u": _k.UINT, "f": _k.FLOAT, "b": _k.BOOL,
-                     "U": _k.STRING,
-                     "M": _k.DATETIME, "m": _k.DATETIME}
+        _np_kinds = {
+            "i": _k.INT,
+            "u": _k.UINT,
+            "f": _k.FLOAT,
+            "b": _k.BOOL,
+            "U": _k.STRING,
+            "M": _k.DATETIME,
+            "m": _k.DATETIME,
+        }
         kind = _np_kinds.get(dtype.kind, None)
         if kind is None:
             # Not a NumPy/CuPy dtype. Check if it's a categorical maybe
@@ -215,15 +237,16 @@ def _dtype_from_cudfdtype(self, dtype) -> Tuple[_DtypeKind, int, str, str]:
                 # We use codes' dtype as these are stored in the buffer.
                 dtype = self._col.codes.dtype
             else:
-                raise ValueError(f"Data type {dtype} not supported by exchange"
-                                 "protocol")
+                raise ValueError(
+                    f"Data type {dtype} not supported by exchange" "protocol"
+                )
 
         if kind not in _SUPPORTED_KINDS:
             raise NotImplementedError(f"Data type {dtype} not handled yet")
 
         bitwidth = dtype.itemsize * 8
         format_str = dtype.str
-        endianness = dtype.byteorder if kind != _k.CATEGORICAL else '='
+        endianness = dtype.byteorder if kind != _k.CATEGORICAL else "="
         return (kind, bitwidth, format_str, endianness)
 
     @property
@@ -246,8 +269,10 @@ def describe_categorical(self) -> Tuple[bool, bool, Dict[int, Any]]:
                           None if not a dictionary-style categorical.
         """
         if not self.dtype[0] == _DtypeKind.CATEGORICAL:
-            raise TypeError("`describe_categorical only works on "
-                            "a column with categorical dtype!")
+            raise TypeError(
+                "`describe_categorical only works on "
+                "a column with categorical dtype!"
+            )
 
         ordered = self._col.dtype.ordered
         is_dictionary = True
@@ -286,8 +311,9 @@ def describe_null(self) -> Tuple[int, Any]:
             return 3, 0
 
         else:
-            raise NotImplementedError(f"Data type {self.dtype}"
-                                      " not yet supported")
+            raise NotImplementedError(
+                f"Data type {self.dtype}" " not yet supported"
+            )
 
     @property
     def null_count(self) -> int:
@@ -309,8 +335,9 @@ def num_chunks(self) -> int:
         """
         return 1
 
-    def get_chunks(self, n_chunks : Optional[int] = None) ->\
-            Iterable['_CuDFColumn']:
+    def get_chunks(
+        self, n_chunks: Optional[int] = None
+    ) -> Iterable["_CuDFColumn"]:
         """
         Return an iterable yielding the chunks.
 
@@ -353,33 +380,42 @@ def get_buffers(self) -> Dict[str, _CuDFBuffer]:
 
         return buffers
 
-    def _get_data_buffer(self) -> Tuple[_CuDFBuffer,
-                                        Tuple[_DtypeKind, int, str, str]]:
+    def _get_data_buffer(
+        self,
+    ) -> Tuple[_CuDFBuffer, Tuple[_DtypeKind, int, str, str]]:
         """
         Return the buffer containing the data and
                the buffer's associated dtype.
         """
         _k = _DtypeKind
         if self.dtype[0] in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL):
-            buffer = _CuDFBuffer(self._col.data, self._col.dtype,
-                                 allow_copy=self._allow_copy)
+            buffer = _CuDFBuffer(
+                self._col.data, self._col.dtype, allow_copy=self._allow_copy
+            )
             dtype = self.dtype
 
         elif self.dtype[0] == _k.CATEGORICAL:
             codes = self._col.codes
-            buffer = _CuDFBuffer(self._col.codes.data, self._col.codes.dtype,
-                                 allow_copy=self._allow_copy)
+            buffer = _CuDFBuffer(
+                self._col.codes.data,
+                self._col.codes.dtype,
+                allow_copy=self._allow_copy,
+            )
             dtype = self._dtype_from_cudfdtype(codes.dtype)
 
         elif self.dtype[0] == _k.STRING:
             encoded_string = self._col.children[1]
-            buffer = _CuDFBuffer(encoded_string.data, encoded_string.dtype,
-                                 allow_copy=self._allow_copy)
+            buffer = _CuDFBuffer(
+                encoded_string.data,
+                encoded_string.dtype,
+                allow_copy=self._allow_copy,
+            )
             dtype = self._dtype_from_cudfdtype(encoded_string.dtype)
 
         else:
-            raise NotImplementedError(f"Data type {self._col.dtype}"
-                                      " not handled yet")
+            raise NotImplementedError(
+                f"Data type {self._col.dtype}" " not handled yet"
+            )
 
         return buffer, dtype
 
@@ -395,26 +431,33 @@ def _get_validity_buffer(self) -> Tuple[_CuDFBuffer, Any]:
         if null == 3:
             _k = _DtypeKind
             if self.dtype[0] == _k.CATEGORICAL:
-                buffer = _CuDFBuffer(self._col.codes.
-                                     _get_mask_as_column().data,
-                                     cp.uint8, allow_copy=self._allow_copy)
+                buffer = _CuDFBuffer(
+                    self._col.codes._get_mask_as_column().data,
+                    cp.uint8,
+                    allow_copy=self._allow_copy,
+                )
             else:
-                buffer = _CuDFBuffer(self._col.
-                                     _get_mask_as_column().data,
-                                     cp.uint8,
-                                     allow_copy=self._allow_copy)
+                buffer = _CuDFBuffer(
+                    self._col._get_mask_as_column().data,
+                    cp.uint8,
+                    allow_copy=self._allow_copy,
+                )
             dtype = (_k.UINT, 8, "C", "=")
             return buffer, dtype
 
         elif null == 1:
-            raise RuntimeError("This column uses NaN as null "
-                               "so does not have a separate mask")
+            raise RuntimeError(
+                "This column uses NaN as null "
+                "so does not have a separate mask"
+            )
         elif null == 0:
-            raise RuntimeError("This column is non-nullable"
-                               " so does not have a mask")
+            raise RuntimeError(
+                "This column is non-nullable" " so does not have a mask"
+            )
         else:
-            raise NotImplementedError(f"See {self.__class__.__name__}"
-                                      ".describe_null method.")
+            raise NotImplementedError(
+                f"See {self.__class__.__name__}" ".describe_null method."
+            )
 
     def _get_offsets_buffer(self) -> Tuple[_CuDFBuffer, Any]:
         """
@@ -428,12 +471,15 @@ def _get_offsets_buffer(self) -> Tuple[_CuDFBuffer, Any]:
         _k = _DtypeKind
         if self.dtype[0] == _k.STRING:
             offsets = self._col.children[0]
-            buffer = _CuDFBuffer(offsets.data, offsets.dtype,
-                                 allow_copy=self._allow_copy)
+            buffer = _CuDFBuffer(
+                offsets.data, offsets.dtype, allow_copy=self._allow_copy
+            )
             dtype = self._dtype_from_cudfdtype(offsets.dtype)
         else:
-            raise RuntimeError("This column has a fixed-length dtype "
-                               "so does not have an offsets buffer")
+            raise RuntimeError(
+                "This column has a fixed-length dtype "
+                "so does not have an offsets buffer"
+            )
 
         return buffer, dtype
 
@@ -447,9 +493,13 @@ class _CuDFDataFrame:
     ``cudf.DataFrame.__dataframe__`` as objects with the methods and
     attributes defined on this class.
     """
-    def __init__(self, df : 'cudf.core.dataframe.DataFrame',
-                 nan_as_null : bool = True,
-                 allow_copy : bool = True) -> None:
+
+    def __init__(
+        self,
+        df: "cudf.core.dataframe.DataFrame",
+        nan_as_null: bool = True,
+        allow_copy: bool = True,
+    ) -> None:
         """
         Constructor - an instance of this (private) class is returned from
         `cudf.DataFrame.__dataframe__`.
@@ -482,41 +532,43 @@ def column_names(self) -> Iterable[str]:
         return self._df.columns.tolist()
 
     def get_column(self, i: int) -> _CuDFColumn:
-        return _CuDFColumn(
-            self._df.iloc[:, i], allow_copy=self._allow_copy)
+        return _CuDFColumn(self._df.iloc[:, i], allow_copy=self._allow_copy)
 
     def get_column_by_name(self, name: str) -> _CuDFColumn:
-        return _CuDFColumn(
-            self._df[name], allow_copy=self._allow_copy)
+        return _CuDFColumn(self._df[name], allow_copy=self._allow_copy)
 
     def get_columns(self) -> Iterable[_CuDFColumn]:
-        return [_CuDFColumn(self._df[name], allow_copy=self._allow_copy)
-                for name in self._df.columns]
+        return [
+            _CuDFColumn(self._df[name], allow_copy=self._allow_copy)
+            for name in self._df.columns
+        ]
 
-    def select_columns(self, indices: Sequence[int]) -> '_CuDFDataFrame':
+    def select_columns(self, indices: Sequence[int]) -> "_CuDFDataFrame":
         if not isinstance(indices, collections.abc.Sequence):
             raise ValueError("`indices` is not a sequence")
 
         return _CuDFDataFrame(self._df.iloc[:, indices])
 
-    def select_columns_by_name(self, names: Sequence[str]) ->\
-            '_CuDFDataFrame':
+    def select_columns_by_name(self, names: Sequence[str]) -> "_CuDFDataFrame":
         if not isinstance(names, collections.Sequence):
             raise ValueError("`names` is not a sequence")
 
-        return _CuDFDataFrame(self._df.loc[:, names], self._nan_as_null,
-                              self._allow_copy)
+        return _CuDFDataFrame(
+            self._df.loc[:, names], self._nan_as_null, self._allow_copy
+        )
 
-    def get_chunks(self, n_chunks : Optional[int] = None) -> \
-            Iterable['_CuDFDataFrame']:
+    def get_chunks(
+        self, n_chunks: Optional[int] = None
+    ) -> Iterable["_CuDFDataFrame"]:
         """
         Return an iterator yielding the chunks.
         """
         return (self,)
 
 
-def __dataframe__(self, nan_as_null : bool = False,
-                  allow_copy : bool = True) -> _CuDFDataFrame:
+def __dataframe__(
+    self, nan_as_null: bool = False, allow_copy: bool = True
+) -> _CuDFDataFrame:
     """
     The public method to attach to cudf.DataFrame.
 
@@ -532,8 +584,7 @@ def __dataframe__(self, nan_as_null : bool = False,
     Currently, if this flag is set to ``False`` and a copy is needed, a
     ``RuntimeError`` will be raised.
     """
-    return _CuDFDataFrame(
-        self, nan_as_null=nan_as_null, allow_copy=allow_copy)
+    return _CuDFDataFrame(self, nan_as_null=nan_as_null, allow_copy=allow_copy)
 
 
 """
@@ -569,21 +620,22 @@ def __dataframe__(self, nan_as_null : bool = False,
 _CP_DTYPES = {0: _INTS, 1: _UINTS, 2: _FLOATS, 20: {8: bool}}
 
 
-def from_dataframe(df : DataFrameObject, allow_copy: bool = False) ->\
-        _CuDFDataFrame :
+def from_dataframe(
+    df: DataFrameObject, allow_copy: bool = False
+) -> _CuDFDataFrame:
     """
     Construct a cudf DataFrame from ``df`` if it supports ``__dataframe__``
     """
     if isinstance(df, cudf.DataFrame):
         return df
 
-    if not hasattr(df, '__dataframe__'):
+    if not hasattr(df, "__dataframe__"):
         raise ValueError("`df` does not support __dataframe__")
 
     return _from_dataframe(df.__dataframe__(allow_copy=allow_copy))
 
 
-def _from_dataframe(df : DataFrameObject) -> _CuDFDataFrame :
+def _from_dataframe(df: DataFrameObject) -> _CuDFDataFrame:
     """
     Create a cudf DataFrame object from DataFrameObject.
     """
@@ -608,8 +660,9 @@ def _from_dataframe(df : DataFrameObject) -> _CuDFDataFrame :
             columns[name], _buf = _protocol_to_cudf_column_string(col)
 
         else:
-            raise NotImplementedError(f"Data type {col.dtype[0]}"
-                                      " not handled yet")
+            raise NotImplementedError(
+                f"Data type {col.dtype[0]}" " not handled yet"
+            )
 
         _buffers.append(_buf)
 
@@ -618,8 +671,9 @@ def _from_dataframe(df : DataFrameObject) -> _CuDFDataFrame :
     return df_new
 
 
-def _protocol_to_cudf_column_numeric(col: ColumnObject) -> \
-        Tuple[cudf.core.column.NumericalColumn, _CuDFBuffer]:
+def _protocol_to_cudf_column_numeric(
+    col: ColumnObject,
+) -> Tuple[cudf.core.column.NumericalColumn, _CuDFBuffer]:
     """
     Convert an int, uint, float or bool protocol column
     to the corresponding cudf column
@@ -627,31 +681,38 @@ def _protocol_to_cudf_column_numeric(col: ColumnObject) -> \
     if col.offset != 0:
         raise NotImplementedError("column.offset > 0 not handled yet")
 
-    _dbuffer, _ddtype = col.get_buffers()['data']
+    _dbuffer, _ddtype = col.get_buffers()["data"]
     _check_buffer_is_on_gpu(_dbuffer)
-    dcol = build_column(Buffer(_dbuffer.ptr, _dbuffer.bufsize),
-                        protocol_dtype_to_cupy_dtype(_ddtype))
+    dcol = build_column(
+        Buffer(_dbuffer.ptr, _dbuffer.bufsize),
+        protocol_dtype_to_cupy_dtype(_ddtype),
+    )
     return _set_missing_values(col, dcol), _dbuffer
 
 
-def _check_buffer_is_on_gpu(buffer : _CuDFBuffer) -> None:
-    if buffer.__dlpack_device__()[0] != _Device.CUDA and \
-            not buffer._allow_copy:
-        raise TypeError("This operation must copy data from CPU to GPU. "
-                        "Set `allow_copy=True` to allow it.")
+def _check_buffer_is_on_gpu(buffer: _CuDFBuffer) -> None:
+    if (
+        buffer.__dlpack_device__()[0] != _Device.CUDA
+        and not buffer._allow_copy
+    ):
+        raise TypeError(
+            "This operation must copy data from CPU to GPU. "
+            "Set `allow_copy=True` to allow it."
+        )
 
 
-def _set_missing_values(protocol_col: _CuDFColumn,
-                        cudf_col: 'cudf.core.dataframe.DataFrame') -> \
-        cudf.core.column.ColumnBase:
+def _set_missing_values(
+    protocol_col: _CuDFColumn, cudf_col: "cudf.core.dataframe.DataFrame"
+) -> cudf.core.column.ColumnBase:
 
     null_kind, null_value = protocol_col.describe_null
     if null_kind != 0:
         assert null_kind == 3, "cudf supports only bit mask, "
         f"null_kind should be 3, got: {null_kind}."
         _mask_buffer, _mask_dtype = protocol_col.get_buffers()["validity"]
-        bitmask = cp.asarray(Buffer(_mask_buffer.ptr, _mask_buffer.bufsize),
-                             cp.bool8)
+        bitmask = cp.asarray(
+            Buffer(_mask_buffer.ptr, _mask_buffer.bufsize), cp.bool8
+        )
         cudf_col[~bitmask] = None
 
     return cudf_col
@@ -666,32 +727,40 @@ def protocol_dtype_to_cupy_dtype(_dtype) -> cp.dtype:
     return _CP_DTYPES[kind][bitwidth]
 
 
-def _protocol_to_cudf_column_categorical(col : ColumnObject) -> \
-        Tuple[cudf.core.column.CategoricalColumn, _CuDFBuffer] :
+def _protocol_to_cudf_column_categorical(
+    col: ColumnObject,
+) -> Tuple[cudf.core.column.CategoricalColumn, _CuDFBuffer]:
     """
     Convert a categorical column to a Series instance
     """
     ordered, is_dict, mapping = col.describe_categorical
     if not is_dict:
-        raise NotImplementedError("Non-dictionary categoricals"
-                                  " not supported yet")
+        raise NotImplementedError(
+            "Non-dictionary categoricals" " not supported yet"
+        )
 
     categories = as_column(mapping.values())
-    codes_buffer, codes_dtype = col.get_buffers()['data']
+    codes_buffer, codes_dtype = col.get_buffers()["data"]
     _check_buffer_is_on_gpu(codes_buffer)
     cdtype = protocol_dtype_to_cupy_dtype(codes_dtype)
-    codes = build_column(Buffer(codes_buffer.ptr, codes_buffer.bufsize),
-                         cdtype)
-
-    cudfcol = build_categorical_column(categories=categories, codes=codes,
-                                       mask=codes.base_mask, size=codes.size,
-                                       ordered=ordered)
+    codes = build_column(
+        Buffer(codes_buffer.ptr, codes_buffer.bufsize), cdtype
+    )
+
+    cudfcol = build_categorical_column(
+        categories=categories,
+        codes=codes,
+        mask=codes.base_mask,
+        size=codes.size,
+        ordered=ordered,
+    )
 
     return _set_missing_values(col, cudfcol), codes_buffer
 
 
-def _protocol_to_cudf_column_string(col : ColumnObject) -> \
-        Tuple[cudf.core.column.StringColumn, Tuple[_CuDFBuffer]] :
+def _protocol_to_cudf_column_string(
+    col: ColumnObject,
+) -> Tuple[cudf.core.column.StringColumn, Tuple[_CuDFBuffer]]:
     """
     Convert a string ColumnObject to cudf Column object.
     """
@@ -701,17 +770,22 @@ def _protocol_to_cudf_column_string(col : ColumnObject) -> \
     # Retrieve the data buffer containing the UTF-8 code units
     data_buffer, data_dtype = buffers["data"]
     _check_buffer_is_on_gpu(data_buffer)
-    encoded_string = build_column(Buffer(data_buffer.ptr, data_buffer.bufsize),
-                                  protocol_dtype_to_cupy_dtype(data_dtype))
+    encoded_string = build_column(
+        Buffer(data_buffer.ptr, data_buffer.bufsize),
+        protocol_dtype_to_cupy_dtype(data_dtype),
+    )
 
     # Retrieve the offsets buffer containing the index offsets demarcating
     # the beginning and end of each string
     offset_buffer, offset_dtype = buffers["offsets"]
     _check_buffer_is_on_gpu(offset_buffer)
-    offsets = build_column(Buffer(offset_buffer.ptr, offset_buffer.bufsize),
-                           protocol_dtype_to_cupy_dtype(offset_dtype))
-
-    col_str = build_column(None, dtype=cp.dtype('O'),
-                           children=(offsets, encoded_string))
+    offsets = build_column(
+        Buffer(offset_buffer.ptr, offset_buffer.bufsize),
+        protocol_dtype_to_cupy_dtype(offset_dtype),
+    )
+
+    col_str = build_column(
+        None, dtype=cp.dtype("O"), children=(offsets, encoded_string)
+    )
 
     return _set_missing_values(col, col_str), buffers
diff --git a/python/cudf/cudf/tests/test_df_protocol.py b/python/cudf/cudf/tests/test_df_protocol.py
index 45fb30ce998..88e040cfdf3 100644
--- a/python/cudf/cudf/tests/test_df_protocol.py
+++ b/python/cudf/cudf/tests/test_df_protocol.py
@@ -23,8 +23,9 @@ def assert_buffer_equal(buffer_and_dtype: Tuple[_CuDFBuffer, Any], cudfcol):
     buf, dtype = buffer_and_dtype
     device_id = cp.asarray(cudfcol.data).device.id
     assert buf.__dlpack_device__() == (2, device_id)
-    col_from_buf = build_column(Buffer(buf.ptr, buf.bufsize),
-                                protocol_dtype_to_cupy_dtype(dtype))
+    col_from_buf = build_column(
+        Buffer(buf.ptr, buf.bufsize), protocol_dtype_to_cupy_dtype(dtype)
+    )
     # check that non null values are the equals as null are represented
     # by sentinel values in the buffer.
     non_null_idxs = cudfcol != cudf.NA
@@ -43,24 +44,26 @@ def assert_column_equal(col: _CuDFColumn, cudfcol):
     assert col.offset == 0
     assert col.null_count == cudfcol.null_count
     assert col.num_chunks() == 1
-    if col.null_count == 0 :
+    if col.null_count == 0:
         pytest.raises(RuntimeError, col._get_validity_buffer)
-        assert col.get_buffers()['validity'] is None
+        assert col.get_buffers()["validity"] is None
     else:
-        assert_buffer_equal(col.get_buffers()['validity'],
-                            cudfcol._get_mask_as_column().astype(cp.uint8))
+        assert_buffer_equal(
+            col.get_buffers()["validity"],
+            cudfcol._get_mask_as_column().astype(cp.uint8),
+        )
 
     if col.dtype[0] == _DtypeKind.CATEGORICAL:
-        assert_buffer_equal(col.get_buffers()['data'], cudfcol.codes)
-        assert col.get_buffers()['offsets'] is None
+        assert_buffer_equal(col.get_buffers()["data"], cudfcol.codes)
+        assert col.get_buffers()["offsets"] is None
 
     elif col.dtype[0] == _DtypeKind.STRING:
-        assert_buffer_equal(col.get_buffers()['data'], cudfcol.children[1])
-        assert_buffer_equal(col.get_buffers()['offsets'], cudfcol.children[0])
+        assert_buffer_equal(col.get_buffers()["data"], cudfcol.children[1])
+        assert_buffer_equal(col.get_buffers()["offsets"], cudfcol.children[0])
 
     else:
-        assert_buffer_equal(col.get_buffers()['data'], cudfcol)
-        assert col.get_buffers()['offsets'] is None
+        assert_buffer_equal(col.get_buffers()["data"], cudfcol)
+        assert col.get_buffers()["offsets"] is None
 
     if col.null_count == 0:
         assert col.describe_null == (0, None)
@@ -124,7 +127,7 @@ def test_float_dtype():
 def test_categorical_dtype():
     cdf = cudf.DataFrame({"A": [1, 2, 5, 1]})
     cdf["A"] = cdf["A"].astype("category")
-    col = cdf.__dataframe__().get_column_by_name('A')
+    col = cdf.__dataframe__().get_column_by_name("A")
     assert col.dtype[0] == _DtypeKind.CATEGORICAL
     assert col.describe_categorical == (False, True, {0: 1, 1: 2, 2: 5})
     _test_from_dataframe_equals(cdf.__dataframe__(allow_copy=False))
@@ -142,33 +145,41 @@ def test_string_dtype():
 
 
 def test_mixed_dtype():
-    data_mixed = dict(int=[1, 2, 3], float=[1.5, 2.5, 3.5],
-                      bool=[True, False, True], categorical=[5, 1, 5],
-                      string=["rapidsai-cudf ", "", "df protocol"])
+    data_mixed = dict(
+        int=[1, 2, 3],
+        float=[1.5, 2.5, 3.5],
+        bool=[True, False, True],
+        categorical=[5, 1, 5],
+        string=["rapidsai-cudf ", "", "df protocol"],
+    )
     _test_datatype(data_mixed)
 
 
 def test_NA_int_dtype():
-    data_int = dict(a=[1, None, 3, None, 5],
-                    b=[9, 10, None, 7, 8],
-                    c=[6, 19, 20, 100, 1000])
+    data_int = dict(
+        a=[1, None, 3, None, 5],
+        b=[9, 10, None, 7, 8],
+        c=[6, 19, 20, 100, 1000],
+    )
     _test_datatype(data_int)
 
 
 def test_NA_float_dtype():
-    data_float = dict(a=[1.4, None, 3.6, None, 5.2],
-                      b=[9.7, 10.9, None, 7.8, 8.2],
-                      c=[6.1, 19.2, 20.3, 100.4, 1000.5])
+    data_float = dict(
+        a=[1.4, None, 3.6, None, 5.2],
+        b=[9.7, 10.9, None, 7.8, 8.2],
+        c=[6.1, 19.2, 20.3, 100.4, 1000.5],
+    )
     _test_datatype(data_float)
 
 
 def test_NA_categorical_dtype():
     df = cudf.DataFrame({"A": [1, 2, 5, 1]})
     df["B"] = df["A"].astype("category")
-    df.at[[1, 3], 'B'] = None  # Set two items to null
+    df.at[[1, 3], "B"] = None  # Set two items to null
 
     # Some detailed testing for correctness of dtype and null handling:
-    col = df.__dataframe__().get_column_by_name('B')
+    col = df.__dataframe__().get_column_by_name("B")
     assert col.dtype[0] == _DtypeKind.CATEGORICAL
     assert col.null_count == 2
     assert col.describe_null == (3, 0)
@@ -199,9 +210,11 @@ def test_NA_string_dtype():
 
 
 def test_NA_mixed_dtype():
-    data_mixed = dict(int=[1, None, 2, 3, 1000],
-                      float=[None, 1.5, 2.5, 3.5, None],
-                      bool=[True, None, False, None, None],
-                      categorical=[5, 1, 5, 3, None],
-                      string=[None, None, None, "df protocol", None])
+    data_mixed = dict(
+        int=[1, None, 2, 3, 1000],
+        float=[None, 1.5, 2.5, 3.5, None],
+        bool=[True, None, False, None, None],
+        categorical=[5, 1, 5, 3, None],
+        string=[None, None, None, "df protocol", None],
+    )
     _test_datatype(data_mixed)

From 8b34a860bd72ccb39314c72e33c21d67eb6655ef Mon Sep 17 00:00:00 2001
From: iskode <ismael.from.kone@gmail.com>
Date: Tue, 2 Nov 2021 09:35:23 +0000
Subject: [PATCH 43/60] fix test errors.

---
 python/cudf/cudf/tests/test_df_protocol.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cudf/cudf/tests/test_df_protocol.py b/python/cudf/cudf/tests/test_df_protocol.py
index 88e040cfdf3..fb96c6b3698 100644
--- a/python/cudf/cudf/tests/test_df_protocol.py
+++ b/python/cudf/cudf/tests/test_df_protocol.py
@@ -28,7 +28,7 @@ def assert_buffer_equal(buffer_and_dtype: Tuple[_CuDFBuffer, Any], cudfcol):
     )
     # check that non null values are the equals as null are represented
     # by sentinel values in the buffer.
-    non_null_idxs = cudfcol != cudf.NA
+    non_null_idxs = cudf.Series(cudfcol) != cudf.NA
     assert_eq(col_from_buf[non_null_idxs], cudfcol[non_null_idxs])
 
     if dtype[0] != _DtypeKind.BOOL:

From eba1cdd8792b2ec892fbe783287608a776c0a62c Mon Sep 17 00:00:00 2001
From: iskode <ismael.from.kone@gmail.com>
Date: Tue, 2 Nov 2021 09:36:13 +0000
Subject: [PATCH 44/60] minor style changes

---
 python/cudf/cudf/core/df_protocol.py | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py
index debb0d29079..458bdb962bb 100644
--- a/python/cudf/cudf/core/df_protocol.py
+++ b/python/cudf/cudf/core/df_protocol.py
@@ -138,7 +138,7 @@ def __init__(
         """
         if not isinstance(column, cudf.Series):
             raise NotImplementedError(
-                "Columns of type {} not handled " "yet".format(type(column))
+                "Columns of type {} not handled yet".format(type(column))
             )
 
         # Store the column as a private attribute
@@ -238,7 +238,7 @@ def _dtype_from_cudfdtype(self, dtype) -> Tuple[_DtypeKind, int, str, str]:
                 dtype = self._col.codes.dtype
             else:
                 raise ValueError(
-                    f"Data type {dtype} not supported by exchange" "protocol"
+                    f"Data type {dtype} not supported by exchange protocol"
                 )
 
         if kind not in _SUPPORTED_KINDS:
@@ -312,7 +312,7 @@ def describe_null(self) -> Tuple[int, Any]:
 
         else:
             raise NotImplementedError(
-                f"Data type {self.dtype}" " not yet supported"
+                f"Data type {self.dtype} not yet supported"
             )
 
     @property
@@ -414,12 +414,14 @@ def _get_data_buffer(
 
         else:
             raise NotImplementedError(
-                f"Data type {self._col.dtype}" " not handled yet"
+                f"Data type {self._col.dtype} not handled yet"
             )
 
         return buffer, dtype
 
-    def _get_validity_buffer(self) -> Tuple[_CuDFBuffer, Any]:
+    def _get_validity_buffer(
+        self,
+    ) -> Tuple[_CuDFBuffer, Tuple[_DtypeKind, int, str, str]]:
         """
         Return the buffer containing the mask values
         indicating missing data and the buffer's associated dtype.
@@ -452,14 +454,16 @@ def _get_validity_buffer(self) -> Tuple[_CuDFBuffer, Any]:
             )
         elif null == 0:
             raise RuntimeError(
-                "This column is non-nullable" " so does not have a mask"
+                "This column is non-nullable so does not have a mask"
             )
         else:
             raise NotImplementedError(
-                f"See {self.__class__.__name__}" ".describe_null method."
+                f"See {self.__class__.__name__}.describe_null method."
             )
 
-    def _get_offsets_buffer(self) -> Tuple[_CuDFBuffer, Any]:
+    def _get_offsets_buffer(
+        self,
+    ) -> Tuple[_CuDFBuffer, Tuple[_DtypeKind, int, str, str]]:
         """
         Return the buffer containing the offset values for
         variable-size binary data (e.g., variable-length strings)
@@ -661,7 +665,7 @@ def _from_dataframe(df: DataFrameObject) -> _CuDFDataFrame:
 
         else:
             raise NotImplementedError(
-                f"Data type {col.dtype[0]}" " not handled yet"
+                f"Data type {col.dtype[0]} not handled yet"
             )
 
         _buffers.append(_buf)
@@ -736,7 +740,7 @@ def _protocol_to_cudf_column_categorical(
     ordered, is_dict, mapping = col.describe_categorical
     if not is_dict:
         raise NotImplementedError(
-            "Non-dictionary categoricals" " not supported yet"
+            "Non-dictionary categoricals not supported yet"
         )
 
     categories = as_column(mapping.values())

From 5cffc2f9a1bfd6356173cf2f82f36bd87860c2b7 Mon Sep 17 00:00:00 2001
From: iskode <ismael.from.kone@gmail.com>
Date: Wed, 3 Nov 2021 10:25:58 +0000
Subject: [PATCH 45/60] remove incorrect comment.

---
 python/cudf/cudf/core/df_protocol.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py
index 458bdb962bb..71910d241c0 100644
--- a/python/cudf/cudf/core/df_protocol.py
+++ b/python/cudf/cudf/core/df_protocol.py
@@ -585,8 +585,6 @@ def __dataframe__(
     allowed to make a copy of the data. For example, copying data would be
     necessary if a library supports strided buffers, given that this protocol
     specifies contiguous buffers.
-    Currently, if this flag is set to ``False`` and a copy is needed, a
-    ``RuntimeError`` will be raised.
     """
     return _CuDFDataFrame(self, nan_as_null=nan_as_null, allow_copy=allow_copy)
 

From 5f27e66f725bb467d14546befe3dc7955af3c1f4 Mon Sep 17 00:00:00 2001
From: iskode <ismael.from.kone@gmail.com>
Date: Wed, 3 Nov 2021 17:49:32 +0000
Subject: [PATCH 46/60] fix some mypy check errors.

---
 python/cudf/cudf/core/df_protocol.py       | 88 +++++++++++++---------
 python/cudf/cudf/tests/test_df_protocol.py |  2 +-
 2 files changed, 52 insertions(+), 38 deletions(-)

diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py
index 71910d241c0..9e2b8b33730 100644
--- a/python/cudf/cudf/core/df_protocol.py
+++ b/python/cudf/cudf/core/df_protocol.py
@@ -1,10 +1,19 @@
 import collections
 import enum
-from typing import Any, Dict, Iterable, Optional, Sequence, Tuple
+from typing import (
+    Any,
+    Dict,
+    Iterable,
+    Optional,
+    Sequence,
+    Tuple,
+    Union,
+    Mapping
+)
 
 import cupy as cp
 import numpy as np
-from numba import cuda
+from numba.cuda import as_cuda_array
 
 import cudf
 from cudf.core.buffer import Buffer
@@ -44,6 +53,7 @@ class _Device(enum.IntEnum):
     _k.BOOL,
     _k.STRING,
 )
+ProtoDtype = Tuple[_DtypeKind, int, str, str]
 
 
 class _CuDFBuffer:
@@ -85,7 +95,7 @@ def __dlpack__(self):
         DLPack not implemented in NumPy yet, so leave it out here.
         """
         try:
-            cudarray = cuda.as_cuda_array(self._buf).view(self._dtype)
+            cudarray = as_cuda_array(self._buf).view(self._dtype)
             res = cp.asarray(cudarray).toDlpack()
 
         except ValueError:
@@ -136,13 +146,12 @@ def __init__(
         Note: doesn't deal with extension arrays yet, just assume a regular
         Series/ndarray for now.
         """
-        if not isinstance(column, cudf.Series):
-            raise NotImplementedError(
-                "Columns of type {} not handled yet".format(type(column))
+        if not isinstance(column, cudf.core.column.ColumnBase):
+            raise TypeError(
+                "column must be a subtype of df.core.column.ColumnBase,"
+                f"got {type(column)}"
             )
-
-        # Store the column as a private attribute
-        self._col = as_column(column)
+        self._col = column
         self._nan_as_null = nan_as_null
         self._allow_copy = allow_copy
 
@@ -161,7 +170,7 @@ def offset(self) -> int:
         return 0
 
     @property
-    def dtype(self) -> Tuple[_DtypeKind, int, str, str]:
+    def dtype(self) -> ProtoDtype:
         """
         Dtype description as a tuple
         ``(kind, bit-width, format string, endianness)``
@@ -210,7 +219,7 @@ def dtype(self) -> Tuple[_DtypeKind, int, str, str]:
 
         return self._dtype_from_cudfdtype(dtype)
 
-    def _dtype_from_cudfdtype(self, dtype) -> Tuple[_DtypeKind, int, str, str]:
+    def _dtype_from_cudfdtype(self, dtype) -> ProtoDtype:
         """
         See `self.dtype` for details.
         """
@@ -345,7 +354,7 @@ def get_chunks(
         """
         return (self,)
 
-    def get_buffers(self) -> Dict[str, _CuDFBuffer]:
+    def get_buffers(self) -> Mapping[str, Union[Tuple[_CuDFBuffer, ProtoDtype], None]]:
         """
         Return a dictionary containing the underlying buffers.
 
@@ -382,7 +391,7 @@ def get_buffers(self) -> Dict[str, _CuDFBuffer]:
 
     def _get_data_buffer(
         self,
-    ) -> Tuple[_CuDFBuffer, Tuple[_DtypeKind, int, str, str]]:
+    ) -> Tuple[_CuDFBuffer, ProtoDtype]:
         """
         Return the buffer containing the data and
                the buffer's associated dtype.
@@ -421,7 +430,7 @@ def _get_data_buffer(
 
     def _get_validity_buffer(
         self,
-    ) -> Tuple[_CuDFBuffer, Tuple[_DtypeKind, int, str, str]]:
+    ) -> Tuple[_CuDFBuffer, ProtoDtype]:
         """
         Return the buffer containing the mask values
         indicating missing data and the buffer's associated dtype.
@@ -463,7 +472,7 @@ def _get_validity_buffer(
 
     def _get_offsets_buffer(
         self,
-    ) -> Tuple[_CuDFBuffer, Tuple[_DtypeKind, int, str, str]]:
+    ) -> Tuple[_CuDFBuffer, ProtoDtype]:
         """
         Return the buffer containing the offset values for
         variable-size binary data (e.g., variable-length strings)
@@ -536,14 +545,17 @@ def column_names(self) -> Iterable[str]:
         return self._df.columns.tolist()
 
     def get_column(self, i: int) -> _CuDFColumn:
-        return _CuDFColumn(self._df.iloc[:, i], allow_copy=self._allow_copy)
+        return _CuDFColumn(as_column(self._df.iloc[:, i]),
+                           allow_copy=self._allow_copy)
 
     def get_column_by_name(self, name: str) -> _CuDFColumn:
-        return _CuDFColumn(self._df[name], allow_copy=self._allow_copy)
+        return _CuDFColumn(as_column(self._df[name]),
+                           allow_copy=self._allow_copy)
 
     def get_columns(self) -> Iterable[_CuDFColumn]:
         return [
-            _CuDFColumn(self._df[name], allow_copy=self._allow_copy)
+            _CuDFColumn(as_column(self._df[name]),
+                        allow_copy=self._allow_copy)
             for name in self._df.columns
         ]
 
@@ -674,22 +686,24 @@ def _from_dataframe(df: DataFrameObject) -> _CuDFDataFrame:
 
 
 def _protocol_to_cudf_column_numeric(
-    col: ColumnObject,
-) -> Tuple[cudf.core.column.NumericalColumn, _CuDFBuffer]:
+    col: _CuDFColumn,
+) -> Tuple[cudf.core.column.NumericalColumn, 
+           Dict[str, Tuple[_CuDFBuffer, ProtoDtype]]]:
     """
     Convert an int, uint, float or bool protocol column
     to the corresponding cudf column
     """
     if col.offset != 0:
         raise NotImplementedError("column.offset > 0 not handled yet")
-
-    _dbuffer, _ddtype = col.get_buffers()["data"]
+    
+    buffers = col.get_buffers()
+    _dbuffer, _ddtype = buffers["data"]
     _check_buffer_is_on_gpu(_dbuffer)
     dcol = build_column(
         Buffer(_dbuffer.ptr, _dbuffer.bufsize),
         protocol_dtype_to_cupy_dtype(_ddtype),
     )
-    return _set_missing_values(col, dcol), _dbuffer
+    return _set_missing_values(col, dcol), buffers
 
 
 def _check_buffer_is_on_gpu(buffer: _CuDFBuffer) -> None:
@@ -704,23 +718,20 @@ def _check_buffer_is_on_gpu(buffer: _CuDFBuffer) -> None:
 
 
 def _set_missing_values(
-    protocol_col: _CuDFColumn, cudf_col: "cudf.core.dataframe.DataFrame"
+    protocol_col: _CuDFColumn, cudf_col: cudf.core.column.ColumnBase
 ) -> cudf.core.column.ColumnBase:
 
-    null_kind, null_value = protocol_col.describe_null
-    if null_kind != 0:
-        assert null_kind == 3, "cudf supports only bit mask, "
-        f"null_kind should be 3, got: {null_kind}."
-        _mask_buffer, _mask_dtype = protocol_col.get_buffers()["validity"]
+    valid_mask = protocol_col.get_buffers()["validity"]
+    if valid_mask is not None:
         bitmask = cp.asarray(
-            Buffer(_mask_buffer.ptr, _mask_buffer.bufsize), cp.bool8
+            Buffer(valid_mask[0].ptr, valid_mask[0].bufsize), cp.bool8
         )
         cudf_col[~bitmask] = None
 
     return cudf_col
 
 
-def protocol_dtype_to_cupy_dtype(_dtype) -> cp.dtype:
+def protocol_dtype_to_cupy_dtype(_dtype: ProtoDtype) -> cp.dtype:
     kind = _dtype[0]
     bitwidth = _dtype[1]
     if _dtype[0] not in _SUPPORTED_KINDS:
@@ -730,8 +741,9 @@ def protocol_dtype_to_cupy_dtype(_dtype) -> cp.dtype:
 
 
 def _protocol_to_cudf_column_categorical(
-    col: ColumnObject,
-) -> Tuple[cudf.core.column.CategoricalColumn, _CuDFBuffer]:
+    col: _CuDFColumn,
+) -> Tuple[cudf.core.column.CategoricalColumn, 
+           Dict[str, Tuple[_CuDFBuffer, ProtoDtype]]]:
     """
     Convert a categorical column to a Series instance
     """
@@ -742,7 +754,8 @@ def _protocol_to_cudf_column_categorical(
         )
 
     categories = as_column(mapping.values())
-    codes_buffer, codes_dtype = col.get_buffers()["data"]
+    buffers = col.get_buffers()
+    codes_buffer, codes_dtype = buffers["data"]
     _check_buffer_is_on_gpu(codes_buffer)
     cdtype = protocol_dtype_to_cupy_dtype(codes_dtype)
     codes = build_column(
@@ -757,12 +770,13 @@ def _protocol_to_cudf_column_categorical(
         ordered=ordered,
     )
 
-    return _set_missing_values(col, cudfcol), codes_buffer
+    return _set_missing_values(col, cudfcol), buffers
 
 
 def _protocol_to_cudf_column_string(
-    col: ColumnObject,
-) -> Tuple[cudf.core.column.StringColumn, Tuple[_CuDFBuffer]]:
+    col: _CuDFColumn,
+) -> Tuple[cudf.core.column.StringColumn, 
+           Dict[str, Tuple[_CuDFBuffer, ProtoDtype]]]:
     """
     Convert a string ColumnObject to cudf Column object.
     """
diff --git a/python/cudf/cudf/tests/test_df_protocol.py b/python/cudf/cudf/tests/test_df_protocol.py
index fb96c6b3698..b97ea950cee 100644
--- a/python/cudf/cudf/tests/test_df_protocol.py
+++ b/python/cudf/cudf/tests/test_df_protocol.py
@@ -26,7 +26,7 @@ def assert_buffer_equal(buffer_and_dtype: Tuple[_CuDFBuffer, Any], cudfcol):
     col_from_buf = build_column(
         Buffer(buf.ptr, buf.bufsize), protocol_dtype_to_cupy_dtype(dtype)
     )
-    # check that non null values are the equals as null are represented
+    # check that non null values are the equals as nulls are represented
     # by sentinel values in the buffer.
     non_null_idxs = cudf.Series(cudfcol) != cudf.NA
     assert_eq(col_from_buf[non_null_idxs], cudfcol[non_null_idxs])

From ac1ceb846d15e9ef9012fadac36669405ef614ce Mon Sep 17 00:00:00 2001
From: iskode <ismael.from.kone@gmail.com>
Date: Tue, 9 Nov 2021 21:34:46 +0000
Subject: [PATCH 47/60] fix mypy errors

---
 python/cudf/cudf/core/df_protocol.py | 140 +++++++++++++++------------
 1 file changed, 79 insertions(+), 61 deletions(-)

diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py
index 9e2b8b33730..d8eb9d09dd3 100644
--- a/python/cudf/cudf/core/df_protocol.py
+++ b/python/cudf/cudf/core/df_protocol.py
@@ -4,11 +4,11 @@
     Any,
     Dict,
     Iterable,
+    Mapping,
     Optional,
     Sequence,
     Tuple,
-    Union,
-    Mapping
+    cast,
 )
 
 import cupy as cp
@@ -244,7 +244,10 @@ def _dtype_from_cudfdtype(self, dtype) -> ProtoDtype:
                 kind = _k.CATEGORICAL
                 # Codes and categories' dtypes are different.
                 # We use codes' dtype as these are stored in the buffer.
-                dtype = self._col.codes.dtype
+                codes = cast(
+                    cudf.core.column.CategoricalColumn, self._col
+                ).codes
+                dtype = codes.dtype
             else:
                 raise ValueError(
                     f"Data type {dtype} not supported by exchange protocol"
@@ -282,12 +285,12 @@ def describe_categorical(self) -> Tuple[bool, bool, Dict[int, Any]]:
                 "`describe_categorical only works on "
                 "a column with categorical dtype!"
             )
-
-        ordered = self._col.dtype.ordered
+        categ_col = cast(cudf.core.column.CategoricalColumn, self._col)
+        ordered = bool(categ_col.dtype.ordered)
         is_dictionary = True
         # NOTE: this shows the children approach is better, transforming
         # `categories` to a "mapping" dict is inefficient
-        categories = self._col.categories
+        categories = categ_col.categories
         mapping = {ix: val for ix, val in enumerate(categories.values_host)}
         return ordered, is_dictionary, mapping
 
@@ -354,7 +357,9 @@ def get_chunks(
         """
         return (self,)
 
-    def get_buffers(self) -> Mapping[str, Union[Tuple[_CuDFBuffer, ProtoDtype], None]]:
+    def get_buffers(
+        self,
+    ) -> Mapping[str, Optional[Tuple[_CuDFBuffer, ProtoDtype]]]:
         """
         Return a dictionary containing the underlying buffers.
 
@@ -389,48 +394,39 @@ def get_buffers(self) -> Mapping[str, Union[Tuple[_CuDFBuffer, ProtoDtype], None
 
         return buffers
 
-    def _get_data_buffer(
-        self,
-    ) -> Tuple[_CuDFBuffer, ProtoDtype]:
+    def _get_data_buffer(self,) -> Tuple[_CuDFBuffer, ProtoDtype]:
         """
         Return the buffer containing the data and
                the buffer's associated dtype.
         """
         _k = _DtypeKind
         if self.dtype[0] in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL):
-            buffer = _CuDFBuffer(
-                self._col.data, self._col.dtype, allow_copy=self._allow_copy
-            )
+            col_data = self._col
             dtype = self.dtype
 
         elif self.dtype[0] == _k.CATEGORICAL:
-            codes = self._col.codes
-            buffer = _CuDFBuffer(
-                self._col.codes.data,
-                self._col.codes.dtype,
-                allow_copy=self._allow_copy,
-            )
-            dtype = self._dtype_from_cudfdtype(codes.dtype)
+            col_data = cast(
+                cudf.core.column.CategoricalColumn, self._col
+            ).codes
+            dtype = self._dtype_from_cudfdtype(col_data.dtype)
 
         elif self.dtype[0] == _k.STRING:
-            encoded_string = self._col.children[1]
-            buffer = _CuDFBuffer(
-                encoded_string.data,
-                encoded_string.dtype,
-                allow_copy=self._allow_copy,
-            )
-            dtype = self._dtype_from_cudfdtype(encoded_string.dtype)
+            col_data = self._col.children[1]
+            dtype = self._dtype_from_cudfdtype(col_data.dtype)
 
         else:
             raise NotImplementedError(
                 f"Data type {self._col.dtype} not handled yet"
             )
+        assert (col_data is not None) and (col_data.data is not None), " "
+        f"col_data(.data) should not be None when dtype = {dtype}"
+        buffer = _CuDFBuffer(
+            col_data.data, col_data.dtype, allow_copy=self._allow_copy
+        )
 
         return buffer, dtype
 
-    def _get_validity_buffer(
-        self,
-    ) -> Tuple[_CuDFBuffer, ProtoDtype]:
+    def _get_validity_buffer(self,) -> Tuple[_CuDFBuffer, ProtoDtype]:
         """
         Return the buffer containing the mask values
         indicating missing data and the buffer's associated dtype.
@@ -442,17 +438,22 @@ def _get_validity_buffer(
         if null == 3:
             _k = _DtypeKind
             if self.dtype[0] == _k.CATEGORICAL:
-                buffer = _CuDFBuffer(
-                    self._col.codes._get_mask_as_column().data,
-                    cp.uint8,
-                    allow_copy=self._allow_copy,
-                )
+                valid_mask = cast(
+                    cudf.core.column.CategoricalColumn, self._col
+                ).codes._get_mask_as_column()
             else:
-                buffer = _CuDFBuffer(
-                    self._col._get_mask_as_column().data,
-                    cp.uint8,
-                    allow_copy=self._allow_copy,
-                )
+                valid_mask = self._col._get_mask_as_column()
+
+            # if (valid_mask is None) or (valid_mask.data is None) :
+            #     raise RuntimeError("valid_mask and valid_mask.data"
+            #     " should not be None when _CuDFColumn.describe_null[0] = 3")
+            assert (valid_mask is not None) and (
+                valid_mask.data is not None
+            ), "valid_mask(.data) should not be None when "
+            "_CuDFColumn.describe_null[0] = 3"
+            buffer = _CuDFBuffer(
+                valid_mask.data, cp.uint8, allow_copy=self._allow_copy
+            )
             dtype = (_k.UINT, 8, "C", "=")
             return buffer, dtype
 
@@ -470,9 +471,7 @@ def _get_validity_buffer(
                 f"See {self.__class__.__name__}.describe_null method."
             )
 
-    def _get_offsets_buffer(
-        self,
-    ) -> Tuple[_CuDFBuffer, ProtoDtype]:
+    def _get_offsets_buffer(self,) -> Tuple[_CuDFBuffer, ProtoDtype]:
         """
         Return the buffer containing the offset values for
         variable-size binary data (e.g., variable-length strings)
@@ -484,6 +483,9 @@ def _get_offsets_buffer(
         _k = _DtypeKind
         if self.dtype[0] == _k.STRING:
             offsets = self._col.children[0]
+            assert (offsets is not None) and (offsets.data is not None), " "
+            "offsets(.data) should not be None for string column"
+
             buffer = _CuDFBuffer(
                 offsets.data, offsets.dtype, allow_copy=self._allow_copy
             )
@@ -545,17 +547,18 @@ def column_names(self) -> Iterable[str]:
         return self._df.columns.tolist()
 
     def get_column(self, i: int) -> _CuDFColumn:
-        return _CuDFColumn(as_column(self._df.iloc[:, i]),
-                           allow_copy=self._allow_copy)
+        return _CuDFColumn(
+            as_column(self._df.iloc[:, i]), allow_copy=self._allow_copy
+        )
 
     def get_column_by_name(self, name: str) -> _CuDFColumn:
-        return _CuDFColumn(as_column(self._df[name]),
-                           allow_copy=self._allow_copy)
+        return _CuDFColumn(
+            as_column(self._df[name]), allow_copy=self._allow_copy
+        )
 
     def get_columns(self) -> Iterable[_CuDFColumn]:
         return [
-            _CuDFColumn(as_column(self._df[name]),
-                        allow_copy=self._allow_copy)
+            _CuDFColumn(as_column(self._df[name]), allow_copy=self._allow_copy)
             for name in self._df.columns
         ]
 
@@ -687,23 +690,26 @@ def _from_dataframe(df: DataFrameObject) -> _CuDFDataFrame:
 
 def _protocol_to_cudf_column_numeric(
     col: _CuDFColumn,
-) -> Tuple[cudf.core.column.NumericalColumn, 
-           Dict[str, Tuple[_CuDFBuffer, ProtoDtype]]]:
+) -> Tuple[
+    cudf.core.column.ColumnBase,
+    Mapping[str, Optional[Tuple[_CuDFBuffer, ProtoDtype]]],
+]:
     """
     Convert an int, uint, float or bool protocol column
     to the corresponding cudf column
     """
     if col.offset != 0:
         raise NotImplementedError("column.offset > 0 not handled yet")
-    
+
     buffers = col.get_buffers()
+    assert buffers["data"] is not None, "data buffer should not be None"
     _dbuffer, _ddtype = buffers["data"]
     _check_buffer_is_on_gpu(_dbuffer)
-    dcol = build_column(
+    cudfcol_num = build_column(
         Buffer(_dbuffer.ptr, _dbuffer.bufsize),
         protocol_dtype_to_cupy_dtype(_ddtype),
     )
-    return _set_missing_values(col, dcol), buffers
+    return _set_missing_values(col, cudfcol_num), buffers
 
 
 def _check_buffer_is_on_gpu(buffer: _CuDFBuffer) -> None:
@@ -716,6 +722,12 @@ def _check_buffer_is_on_gpu(buffer: _CuDFBuffer) -> None:
             "Set `allow_copy=True` to allow it."
         )
 
+    elif buffer.__dlpack_device__()[0] != _Device.CUDA and buffer._allow_copy:
+        raise NotImplementedError(
+            "Only cuDF/GPU dataframes are supported for now."
+            "CPU (like `Pandas`) dataframes will be supported shortly."
+        )
+
 
 def _set_missing_values(
     protocol_col: _CuDFColumn, cudf_col: cudf.core.column.ColumnBase
@@ -742,8 +754,10 @@ def protocol_dtype_to_cupy_dtype(_dtype: ProtoDtype) -> cp.dtype:
 
 def _protocol_to_cudf_column_categorical(
     col: _CuDFColumn,
-) -> Tuple[cudf.core.column.CategoricalColumn, 
-           Dict[str, Tuple[_CuDFBuffer, ProtoDtype]]]:
+) -> Tuple[
+    cudf.core.column.ColumnBase,
+    Mapping[str, Optional[Tuple[_CuDFBuffer, ProtoDtype]]],
+]:
     """
     Convert a categorical column to a Series instance
     """
@@ -755,6 +769,7 @@ def _protocol_to_cudf_column_categorical(
 
     categories = as_column(mapping.values())
     buffers = col.get_buffers()
+    assert buffers["data"] is not None, "data buffer should not be None"
     codes_buffer, codes_dtype = buffers["data"]
     _check_buffer_is_on_gpu(codes_buffer)
     cdtype = protocol_dtype_to_cupy_dtype(codes_dtype)
@@ -775,8 +790,10 @@ def _protocol_to_cudf_column_categorical(
 
 def _protocol_to_cudf_column_string(
     col: _CuDFColumn,
-) -> Tuple[cudf.core.column.StringColumn, 
-           Dict[str, Tuple[_CuDFBuffer, ProtoDtype]]]:
+) -> Tuple[
+    cudf.core.column.ColumnBase,
+    Mapping[str, Optional[Tuple[_CuDFBuffer, ProtoDtype]]],
+]:
     """
     Convert a string ColumnObject to cudf Column object.
     """
@@ -784,6 +801,7 @@ def _protocol_to_cudf_column_string(
     buffers = col.get_buffers()
 
     # Retrieve the data buffer containing the UTF-8 code units
+    assert buffers["data"] is not None, "data buffer should never be None"
     data_buffer, data_dtype = buffers["data"]
     _check_buffer_is_on_gpu(data_buffer)
     encoded_string = build_column(
@@ -793,6 +811,7 @@ def _protocol_to_cudf_column_string(
 
     # Retrieve the offsets buffer containing the index offsets demarcating
     # the beginning and end of each string
+    assert buffers["offsets"] is not None, "not possible for string column"
     offset_buffer, offset_dtype = buffers["offsets"]
     _check_buffer_is_on_gpu(offset_buffer)
     offsets = build_column(
@@ -800,8 +819,7 @@ def _protocol_to_cudf_column_string(
         protocol_dtype_to_cupy_dtype(offset_dtype),
     )
 
-    col_str = build_column(
+    cudfcol_str = build_column(
         None, dtype=cp.dtype("O"), children=(offsets, encoded_string)
     )
-
-    return _set_missing_values(col, col_str), buffers
+    return _set_missing_values(col, cudfcol_str), buffers

From e274ea27574f29f9af6258cc676e17dc8ce2545e Mon Sep 17 00:00:00 2001
From: iskode <ismael.from.kone@gmail.com>
Date: Tue, 9 Nov 2021 23:33:19 +0000
Subject: [PATCH 48/60] fix last  mypy errors

---
 python/cudf/cudf/core/df_protocol.py | 76 ++++++++++++++--------------
 1 file changed, 37 insertions(+), 39 deletions(-)

diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py
index d8eb9d09dd3..0ec66d7fa12 100644
--- a/python/cudf/cudf/core/df_protocol.py
+++ b/python/cudf/cudf/core/df_protocol.py
@@ -10,7 +10,7 @@
     Tuple,
     cast,
 )
-
+import warnings
 import cupy as cp
 import numpy as np
 from numba.cuda import as_cuda_array
@@ -381,7 +381,6 @@ def get_buffers(
                          buffer.
         """
         buffers = {}
-        buffers["data"] = self._get_data_buffer()
         try:
             buffers["validity"] = self._get_validity_buffer()
         except RuntimeError:
@@ -392,41 +391,11 @@ def get_buffers(
         except RuntimeError:
             buffers["offsets"] = None
 
-        return buffers
-
-    def _get_data_buffer(self,) -> Tuple[_CuDFBuffer, ProtoDtype]:
-        """
-        Return the buffer containing the data and
-               the buffer's associated dtype.
-        """
-        _k = _DtypeKind
-        if self.dtype[0] in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL):
-            col_data = self._col
-            dtype = self.dtype
-
-        elif self.dtype[0] == _k.CATEGORICAL:
-            col_data = cast(
-                cudf.core.column.CategoricalColumn, self._col
-            ).codes
-            dtype = self._dtype_from_cudfdtype(col_data.dtype)
-
-        elif self.dtype[0] == _k.STRING:
-            col_data = self._col.children[1]
-            dtype = self._dtype_from_cudfdtype(col_data.dtype)
-
-        else:
-            raise NotImplementedError(
-                f"Data type {self._col.dtype} not handled yet"
-            )
-        assert (col_data is not None) and (col_data.data is not None), " "
-        f"col_data(.data) should not be None when dtype = {dtype}"
-        buffer = _CuDFBuffer(
-            col_data.data, col_data.dtype, allow_copy=self._allow_copy
-        )
+        buffers["data"] = self._get_data_buffer()
 
-        return buffer, dtype
+        return buffers
 
-    def _get_validity_buffer(self,) -> Tuple[_CuDFBuffer, ProtoDtype]:
+    def _get_validity_buffer(self,) -> Optional[Tuple[_CuDFBuffer, ProtoDtype]]:
         """
         Return the buffer containing the mask values
         indicating missing data and the buffer's associated dtype.
@@ -444,9 +413,6 @@ def _get_validity_buffer(self,) -> Tuple[_CuDFBuffer, ProtoDtype]:
             else:
                 valid_mask = self._col._get_mask_as_column()
 
-            # if (valid_mask is None) or (valid_mask.data is None) :
-            #     raise RuntimeError("valid_mask and valid_mask.data"
-            #     " should not be None when _CuDFColumn.describe_null[0] = 3")
             assert (valid_mask is not None) and (
                 valid_mask.data is not None
             ), "valid_mask(.data) should not be None when "
@@ -471,7 +437,7 @@ def _get_validity_buffer(self,) -> Tuple[_CuDFBuffer, ProtoDtype]:
                 f"See {self.__class__.__name__}.describe_null method."
             )
 
-    def _get_offsets_buffer(self,) -> Tuple[_CuDFBuffer, ProtoDtype]:
+    def _get_offsets_buffer(self,) -> Optional[Tuple[_CuDFBuffer, ProtoDtype]]:
         """
         Return the buffer containing the offset values for
         variable-size binary data (e.g., variable-length strings)
@@ -498,6 +464,38 @@ def _get_offsets_buffer(self,) -> Tuple[_CuDFBuffer, ProtoDtype]:
 
         return buffer, dtype
 
+    def _get_data_buffer(self,) -> Tuple[_CuDFBuffer, ProtoDtype]:
+        """
+        Return the buffer containing the data and
+               the buffer's associated dtype.
+        """
+        _k = _DtypeKind
+        if self.dtype[0] in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL):
+            col_data = self._col
+            dtype = self.dtype
+
+        elif self.dtype[0] == _k.CATEGORICAL:
+            col_data = cast(
+                cudf.core.column.CategoricalColumn, self._col
+            ).codes
+            dtype = self._dtype_from_cudfdtype(col_data.dtype)
+
+        elif self.dtype[0] == _k.STRING:
+            col_data = self._col.children[1]
+            dtype = self._dtype_from_cudfdtype(col_data.dtype)
+
+        else:
+            raise NotImplementedError(
+                f"Data type {self._col.dtype} not handled yet"
+            )
+        assert (col_data is not None) and (col_data.data is not None), \
+            f"col_data(.data) should not be None when dtype = {dtype}"
+        buffer = _CuDFBuffer(
+            col_data.data, col_data.dtype, allow_copy=self._allow_copy
+        )
+
+        return buffer, dtype
+
 
 class _CuDFDataFrame:
     """

From 7aa325ed23b64f33d7cc85d22daafd4bea4dba34 Mon Sep 17 00:00:00 2001
From: iskode <ismael.from.kone@gmail.com>
Date: Tue, 9 Nov 2021 23:41:36 +0000
Subject: [PATCH 49/60] rerun black,isort, flake8

---
 python/cudf/cudf/core/df_protocol.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py
index 0ec66d7fa12..966c6597f8b 100644
--- a/python/cudf/cudf/core/df_protocol.py
+++ b/python/cudf/cudf/core/df_protocol.py
@@ -10,7 +10,7 @@
     Tuple,
     cast,
 )
-import warnings
+
 import cupy as cp
 import numpy as np
 from numba.cuda import as_cuda_array
@@ -395,7 +395,9 @@ def get_buffers(
 
         return buffers
 
-    def _get_validity_buffer(self,) -> Optional[Tuple[_CuDFBuffer, ProtoDtype]]:
+    def _get_validity_buffer(
+        self,
+    ) -> Optional[Tuple[_CuDFBuffer, ProtoDtype]]:
         """
         Return the buffer containing the mask values
         indicating missing data and the buffer's associated dtype.

From 80bf86a104007faf437e2e471f4650150b956102 Mon Sep 17 00:00:00 2001
From: iskode <ismael.from.kone@gmail.com>
Date: Tue, 9 Nov 2021 23:48:15 +0000
Subject: [PATCH 50/60] run isort on dataframe.py

---
 python/cudf/cudf/core/dataframe.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 4988b422caf..c2c529320fc 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -40,7 +40,7 @@
     is_string_dtype,
     is_struct_dtype,
 )
-from cudf.core import column, reshape, df_protocol
+from cudf.core import column, df_protocol, reshape
 from cudf.core.abc import Serializable
 from cudf.core.column import (
     as_column,

From 9a3549c6ec020b492ac61c525d71339d0335ab96 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Isma=C3=ABl=20Kon=C3=A9?= <i.kone@edu.umi.ac.ma>
Date: Wed, 10 Nov 2021 07:14:00 +0100
Subject: [PATCH 51/60] Remove _DTypeKind alias _k

Co-authored-by: Bradley Dice <bdice@bradleydice.com>
---
 python/cudf/cudf/core/df_protocol.py | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py
index 966c6597f8b..1f64074e2d2 100644
--- a/python/cudf/cudf/core/df_protocol.py
+++ b/python/cudf/cudf/core/df_protocol.py
@@ -44,15 +44,14 @@ class _Device(enum.IntEnum):
     ROCM = 10
 
 
-_k = _DtypeKind
-_SUPPORTED_KINDS = (
-    _k.INT,
-    _k.UINT,
-    _k.FLOAT,
-    _k.CATEGORICAL,
-    _k.BOOL,
-    _k.STRING,
-)
+_SUPPORTED_KINDS = {
+    _DtypeKind.INT,
+    _DtypeKind.UINT,
+    _DtypeKind.FLOAT,
+    _DtypeKind.CATEGORICAL,
+    _DtypeKind.BOOL,
+    _DtypeKind.STRING,
+}
 ProtoDtype = Tuple[_DtypeKind, int, str, str]
 
 

From b76d419f466f51cf0059e6b7ecc7a26a878ebd0e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Isma=C3=ABl=20Kon=C3=A9?= <i.kone@edu.umi.ac.ma>
Date: Wed, 10 Nov 2021 07:15:32 +0100
Subject: [PATCH 52/60] Remove _DTypeKind alias _k

Co-authored-by: Bradley Dice <bdice@bradleydice.com>
---
 python/cudf/cudf/core/df_protocol.py | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py
index 1f64074e2d2..41cbd748cee 100644
--- a/python/cudf/cudf/core/df_protocol.py
+++ b/python/cudf/cudf/core/df_protocol.py
@@ -226,15 +226,14 @@ def _dtype_from_cudfdtype(self, dtype) -> ProtoDtype:
         #       'b', 'B' (bytes), 'S', 'a', (old-style string) 'V' (void)
         #       not handled datetime and timedelta both map to datetime
         #       (is timedelta handled?)
-        _k = _DtypeKind
         _np_kinds = {
-            "i": _k.INT,
-            "u": _k.UINT,
-            "f": _k.FLOAT,
-            "b": _k.BOOL,
-            "U": _k.STRING,
-            "M": _k.DATETIME,
-            "m": _k.DATETIME,
+            "i": _DtypeKind.INT,
+            "u": _DtypeKind.UINT,
+            "f": _DtypeKind.FLOAT,
+            "b": _DtypeKind.BOOL,
+            "U": _DtypeKind.STRING,
+            "M": _DtypeKind.DATETIME,
+            "m": _DtypeKind.DATETIME,
         }
         kind = _np_kinds.get(dtype.kind, None)
         if kind is None:

From a34f1186138ea379c8aa74460129ce2746763ed6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Isma=C3=ABl=20Kon=C3=A9?= <i.kone@edu.umi.ac.ma>
Date: Wed, 10 Nov 2021 07:16:04 +0100
Subject: [PATCH 53/60] Remove _DTypeKind alias _k

Co-authored-by: Bradley Dice <bdice@bradleydice.com>
---
 python/cudf/cudf/core/df_protocol.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py
index 41cbd748cee..4f1e88f14b0 100644
--- a/python/cudf/cudf/core/df_protocol.py
+++ b/python/cudf/cudf/core/df_protocol.py
@@ -256,7 +256,7 @@ def _dtype_from_cudfdtype(self, dtype) -> ProtoDtype:
 
         bitwidth = dtype.itemsize * 8
         format_str = dtype.str
-        endianness = dtype.byteorder if kind != _k.CATEGORICAL else "="
+        endianness = dtype.byteorder if kind != _DtypeKind.CATEGORICAL else "="
         return (kind, bitwidth, format_str, endianness)
 
     @property

From 99ca31d972c4b1a040fc31a81ee69e2f3d4eaf07 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Isma=C3=ABl=20Kon=C3=A9?= <i.kone@edu.umi.ac.ma>
Date: Wed, 10 Nov 2021 07:16:32 +0100
Subject: [PATCH 54/60] Remove _DTypeKind alias _k

Co-authored-by: Bradley Dice <bdice@bradleydice.com>
---
 python/cudf/cudf/core/df_protocol.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py
index 4f1e88f14b0..19dbef46eb0 100644
--- a/python/cudf/cudf/core/df_protocol.py
+++ b/python/cudf/cudf/core/df_protocol.py
@@ -405,8 +405,7 @@ def _get_validity_buffer(
 
         null, invalid = self.describe_null
         if null == 3:
-            _k = _DtypeKind
-            if self.dtype[0] == _k.CATEGORICAL:
+            if self.dtype[0] == _DtypeKind.CATEGORICAL:
                 valid_mask = cast(
                     cudf.core.column.CategoricalColumn, self._col
                 ).codes._get_mask_as_column()

From 87bba3236593b4fe9e51b5ba898a6f33b7f46ffa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Isma=C3=ABl=20Kon=C3=A9?= <i.kone@edu.umi.ac.ma>
Date: Wed, 10 Nov 2021 07:16:59 +0100
Subject: [PATCH 55/60] Remove _DTypeKind alias _k

Co-authored-by: Bradley Dice <bdice@bradleydice.com>
---
 python/cudf/cudf/core/df_protocol.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py
index 19dbef46eb0..520944a085d 100644
--- a/python/cudf/cudf/core/df_protocol.py
+++ b/python/cudf/cudf/core/df_protocol.py
@@ -419,7 +419,7 @@ def _get_validity_buffer(
             buffer = _CuDFBuffer(
                 valid_mask.data, cp.uint8, allow_copy=self._allow_copy
             )
-            dtype = (_k.UINT, 8, "C", "=")
+            dtype = (_DtypeKind.UINT, 8, "C", "=")
             return buffer, dtype
 
         elif null == 1:

From 535e56e37848307c78499eaec217140e6a591600 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Isma=C3=ABl=20Kon=C3=A9?= <i.kone@edu.umi.ac.ma>
Date: Wed, 10 Nov 2021 07:29:03 +0100
Subject: [PATCH 56/60] add space to multi-line comment.

Co-authored-by: Bradley Dice <bdice@bradleydice.com>
---
 python/cudf/cudf/core/df_protocol.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py
index 520944a085d..bb9e69b8c4f 100644
--- a/python/cudf/cudf/core/df_protocol.py
+++ b/python/cudf/cudf/core/df_protocol.py
@@ -721,7 +721,7 @@ def _check_buffer_is_on_gpu(buffer: _CuDFBuffer) -> None:
 
     elif buffer.__dlpack_device__()[0] != _Device.CUDA and buffer._allow_copy:
         raise NotImplementedError(
-            "Only cuDF/GPU dataframes are supported for now."
+            "Only cuDF/GPU dataframes are supported for now. "
             "CPU (like `Pandas`) dataframes will be supported shortly."
         )
 

From b421a2954cfc157f3ca48265168327b469257d0e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Isma=C3=ABl=20Kon=C3=A9?= <i.kone@edu.umi.ac.ma>
Date: Wed, 10 Nov 2021 07:29:54 +0100
Subject: [PATCH 57/60] Remove _DTypeKind alias _k

Co-authored-by: Bradley Dice <bdice@bradleydice.com>
---
 python/cudf/cudf/core/df_protocol.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py
index bb9e69b8c4f..26b69ee70f0 100644
--- a/python/cudf/cudf/core/df_protocol.py
+++ b/python/cudf/cudf/core/df_protocol.py
@@ -239,7 +239,7 @@ def _dtype_from_cudfdtype(self, dtype) -> ProtoDtype:
         if kind is None:
             # Not a NumPy/CuPy dtype. Check if it's a categorical maybe
             if isinstance(dtype, cudf.CategoricalDtype):
-                kind = _k.CATEGORICAL
+                kind = _DtypeKind.CATEGORICAL
                 # Codes and categories' dtypes are different.
                 # We use codes' dtype as these are stored in the buffer.
                 codes = cast(

From 6ae5ee0f0b00170df015db5b706eb954a9ecb4d0 Mon Sep 17 00:00:00 2001
From: iskode <ismael.from.kone@gmail.com>
Date: Wed, 10 Nov 2021 11:27:37 +0000
Subject: [PATCH 58/60] remove remaining _DtypeKind aliases

---
 python/cudf/cudf/core/df_protocol.py | 27 +++++++++++++++++----------
 1 file changed, 17 insertions(+), 10 deletions(-)

diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py
index 26b69ee70f0..8f258ce27b2 100644
--- a/python/cudf/cudf/core/df_protocol.py
+++ b/python/cudf/cudf/core/df_protocol.py
@@ -445,8 +445,7 @@ def _get_offsets_buffer(self,) -> Optional[Tuple[_CuDFBuffer, ProtoDtype]]:
         Raises RuntimeError if the data buffer does not have an associated
         offsets buffer.
         """
-        _k = _DtypeKind
-        if self.dtype[0] == _k.STRING:
+        if self.dtype[0] == _DtypeKind.STRING:
             offsets = self._col.children[0]
             assert (offsets is not None) and (offsets.data is not None), " "
             "offsets(.data) should not be None for string column"
@@ -468,18 +467,22 @@ def _get_data_buffer(self,) -> Tuple[_CuDFBuffer, ProtoDtype]:
         Return the buffer containing the data and
                the buffer's associated dtype.
         """
-        _k = _DtypeKind
-        if self.dtype[0] in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL):
+        if self.dtype[0] in (
+            _DtypeKind.INT,
+            _DtypeKind.UINT,
+            _DtypeKind.FLOAT,
+            _DtypeKind.BOOL,
+        ):
             col_data = self._col
             dtype = self.dtype
 
-        elif self.dtype[0] == _k.CATEGORICAL:
+        elif self.dtype[0] == _DtypeKind.CATEGORICAL:
             col_data = cast(
                 cudf.core.column.CategoricalColumn, self._col
             ).codes
             dtype = self._dtype_from_cudfdtype(col_data.dtype)
 
-        elif self.dtype[0] == _k.STRING:
+        elif self.dtype[0] == _DtypeKind.STRING:
             col_data = self._col.children[1]
             dtype = self._dtype_from_cudfdtype(col_data.dtype)
 
@@ -659,18 +662,22 @@ def _from_dataframe(df: DataFrameObject) -> _CuDFDataFrame:
 
     # We need a dict of columns here, with each column being a cudf column.
     columns = dict()
-    _k = _DtypeKind
     _buffers = []  # hold on to buffers, keeps memory alive
     for name in df.column_names():
         col = df.get_column_by_name(name)
 
-        if col.dtype[0] in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL):
+        if col.dtype[0] in (
+            _DtypeKind.INT,
+            _DtypeKind.UINT,
+            _DtypeKind.FLOAT,
+            _DtypeKind.BOOL,
+        ):
             columns[name], _buf = _protocol_to_cudf_column_numeric(col)
 
-        elif col.dtype[0] == _k.CATEGORICAL:
+        elif col.dtype[0] == _DtypeKind.CATEGORICAL:
             columns[name], _buf = _protocol_to_cudf_column_categorical(col)
 
-        elif col.dtype[0] == _k.STRING:
+        elif col.dtype[0] == _DtypeKind.STRING:
             columns[name], _buf = _protocol_to_cudf_column_string(col)
 
         else:

From 581153fc413ef18dad14072ef6131c52bf17772d Mon Sep 17 00:00:00 2001
From: iskode <ismael.from.kone@gmail.com>
Date: Wed, 10 Nov 2021 11:31:37 +0000
Subject: [PATCH 59/60] import DataFrameObject from df_protocol

---
 python/cudf/cudf/tests/test_df_protocol.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/python/cudf/cudf/tests/test_df_protocol.py b/python/cudf/cudf/tests/test_df_protocol.py
index b97ea950cee..4408cafa80f 100644
--- a/python/cudf/cudf/tests/test_df_protocol.py
+++ b/python/cudf/cudf/tests/test_df_protocol.py
@@ -8,6 +8,7 @@
 from cudf.core.buffer import Buffer
 from cudf.core.column import build_column
 from cudf.core.df_protocol import (
+    DataFrameObject,
     _CuDFBuffer,
     _CuDFColumn,
     _DtypeKind,
@@ -16,8 +17,6 @@
 )
 from cudf.testing._utils import assert_eq
 
-DataFrameObject = Any
-
 
 def assert_buffer_equal(buffer_and_dtype: Tuple[_CuDFBuffer, Any], cudfcol):
     buf, dtype = buffer_and_dtype

From c1231cbc3ae62128167ab2ee9da462bd5b0b2dba Mon Sep 17 00:00:00 2001
From: iskode <ismael.from.kone@gmail.com>
Date: Wed, 10 Nov 2021 11:58:56 +0000
Subject: [PATCH 60/60] rename assertion methods

---
 python/cudf/cudf/tests/test_df_protocol.py | 40 +++++++++++-----------
 1 file changed, 20 insertions(+), 20 deletions(-)

diff --git a/python/cudf/cudf/tests/test_df_protocol.py b/python/cudf/cudf/tests/test_df_protocol.py
index 4408cafa80f..d24c8ca2860 100644
--- a/python/cudf/cudf/tests/test_df_protocol.py
+++ b/python/cudf/cudf/tests/test_df_protocol.py
@@ -79,7 +79,7 @@ def assert_dataframe_equal(dfo: DataFrameObject, df: cudf.DataFrame):
         assert_column_equal(dfo.get_column_by_name(col), df[col]._column)
 
 
-def _test_from_dataframe_equals(dfobj):
+def assert_from_dataframe_equals(dfobj):
     df2 = _from_dataframe(dfobj)
 
     assert_dataframe_equal(dfobj, df2)
@@ -93,17 +93,17 @@ def _test_from_dataframe_equals(dfobj):
         raise TypeError(f"{type(dfobj._df)} not supported yet.")
 
 
-def _test_from_dataframe_exception(dfobj):
+def assert_from_dataframe_exception(dfobj):
     exception_msg = "This operation must copy data from CPU to GPU."
     " Set `allow_copy=True` to allow it."
     with pytest.raises(TypeError, match=exception_msg):
         _from_dataframe(dfobj)
 
 
-def _test_datatype(data):
+def assert_df_unique_dtype_cols(data):
     cdf = cudf.DataFrame(data=data)
-    _test_from_dataframe_equals(cdf.__dataframe__(allow_copy=False))
-    _test_from_dataframe_equals(cdf.__dataframe__(allow_copy=True))
+    assert_from_dataframe_equals(cdf.__dataframe__(allow_copy=False))
+    assert_from_dataframe_equals(cdf.__dataframe__(allow_copy=True))
 
 
 def test_from_dataframe():
@@ -115,12 +115,12 @@ def test_from_dataframe():
 
 def test_int_dtype():
     data_int = dict(a=[1, 2, 3], b=[9, 10, 11])
-    _test_datatype(data_int)
+    assert_df_unique_dtype_cols(data_int)
 
 
 def test_float_dtype():
     data_float = dict(a=[1.5, 2.5, 3.5], b=[9.2, 10.5, 11.8])
-    _test_datatype(data_float)
+    assert_df_unique_dtype_cols(data_float)
 
 
 def test_categorical_dtype():
@@ -129,18 +129,18 @@ def test_categorical_dtype():
     col = cdf.__dataframe__().get_column_by_name("A")
     assert col.dtype[0] == _DtypeKind.CATEGORICAL
     assert col.describe_categorical == (False, True, {0: 1, 1: 2, 2: 5})
-    _test_from_dataframe_equals(cdf.__dataframe__(allow_copy=False))
-    _test_from_dataframe_equals(cdf.__dataframe__(allow_copy=True))
+    assert_from_dataframe_equals(cdf.__dataframe__(allow_copy=False))
+    assert_from_dataframe_equals(cdf.__dataframe__(allow_copy=True))
 
 
 def test_bool_dtype():
     data_bool = dict(a=[True, True, False], b=[False, True, False])
-    _test_datatype(data_bool)
+    assert_df_unique_dtype_cols(data_bool)
 
 
 def test_string_dtype():
     data_string = dict(a=["a", "b", "cdef", "", "g"])
-    _test_datatype(data_string)
+    assert_df_unique_dtype_cols(data_string)
 
 
 def test_mixed_dtype():
@@ -151,7 +151,7 @@ def test_mixed_dtype():
         categorical=[5, 1, 5],
         string=["rapidsai-cudf ", "", "df protocol"],
     )
-    _test_datatype(data_mixed)
+    assert_df_unique_dtype_cols(data_mixed)
 
 
 def test_NA_int_dtype():
@@ -160,7 +160,7 @@ def test_NA_int_dtype():
         b=[9, 10, None, 7, 8],
         c=[6, 19, 20, 100, 1000],
     )
-    _test_datatype(data_int)
+    assert_df_unique_dtype_cols(data_int)
 
 
 def test_NA_float_dtype():
@@ -169,7 +169,7 @@ def test_NA_float_dtype():
         b=[9.7, 10.9, None, 7.8, 8.2],
         c=[6.1, 19.2, 20.3, 100.4, 1000.5],
     )
-    _test_datatype(data_float)
+    assert_df_unique_dtype_cols(data_float)
 
 
 def test_NA_categorical_dtype():
@@ -184,13 +184,13 @@ def test_NA_categorical_dtype():
     assert col.describe_null == (3, 0)
     assert col.num_chunks() == 1
     assert col.describe_categorical == (False, True, {0: 1, 1: 2, 2: 5})
-    _test_from_dataframe_equals(df.__dataframe__(allow_copy=False))
-    _test_from_dataframe_equals(df.__dataframe__(allow_copy=True))
+    assert_from_dataframe_equals(df.__dataframe__(allow_copy=False))
+    assert_from_dataframe_equals(df.__dataframe__(allow_copy=True))
 
 
 def test_NA_bool_dtype():
     data_bool = dict(a=[None, True, False], b=[False, None, None])
-    _test_datatype(data_bool)
+    assert_df_unique_dtype_cols(data_bool)
 
 
 def test_NA_string_dtype():
@@ -204,8 +204,8 @@ def test_NA_string_dtype():
     assert col.null_count == 1
     assert col.describe_null == (3, 0)
     assert col.num_chunks() == 1
-    _test_from_dataframe_equals(df.__dataframe__(allow_copy=False))
-    _test_from_dataframe_equals(df.__dataframe__(allow_copy=True))
+    assert_from_dataframe_equals(df.__dataframe__(allow_copy=False))
+    assert_from_dataframe_equals(df.__dataframe__(allow_copy=True))
 
 
 def test_NA_mixed_dtype():
@@ -216,4 +216,4 @@ def test_NA_mixed_dtype():
         categorical=[5, 1, 5, 3, None],
         string=[None, None, None, "df protocol", None],
     )
-    _test_datatype(data_mixed)
+    assert_df_unique_dtype_cols(data_mixed)