Adding Interval Dtype (#6984)
This PR adds the Interval dtype to cuDF. Interval is an ExtensionDtype for interval data. This PR closes #5376.
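A rough usage sketch of what this enables, assuming Series construction from pandas interval data is covered by the hunks below (a sketch, not the committed API surface):

```python
import pandas as pd
import cudf

# Interval data created with pandas...
pd_sr = pd.Series(pd.IntervalIndex.from_breaks([0, 1, 2, 3], closed="right"))

# ...can now be moved into a cudf Series and round-tripped back.
# to_pandas() converts through Arrow into a pandas Series backed by
# pd.IntervalDtype.
gdf_sr = cudf.Series(pd_sr)
roundtrip = gdf_sr.to_pandas()
```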

Authors:
  - Marlene (@marlenezw)

Approvers:
  - Keith Kraus (@kkraus14)

URL: #6984
marlenezw authored Feb 19, 2021
1 parent 593dc1c commit 0d03d9f
Showing 9 changed files with 354 additions and 7 deletions.
1 change: 1 addition & 0 deletions python/cudf/cudf/core/column/__init__.py
@@ -21,4 +21,5 @@
from cudf.core.column.string import StringColumn # noqa: F401
from cudf.core.column.struct import StructColumn # noqa: F401
from cudf.core.column.timedelta import TimeDeltaColumn # noqa: F401
from cudf.core.column.interval import IntervalColumn # noqa: F401
from cudf.core.column.decimal import DecimalColumn # noqa: F401
57 changes: 53 additions & 4 deletions python/cudf/cudf/core/column/column.py
@@ -54,6 +54,7 @@
is_scalar,
is_string_dtype,
is_struct_dtype,
is_interval_dtype,
min_signed_type,
min_unsigned_type,
np_to_pa_dtype,
@@ -117,6 +118,10 @@ def to_pandas(
pd_series = pd.Series(pandas_array, copy=False)
elif str(self.dtype) in NUMERIC_TYPES and self.null_count == 0:
pd_series = pd.Series(cupy.asnumpy(self.values), copy=False)
elif is_interval_dtype(self.dtype):
pd_series = pd.Series(
pd.IntervalDtype().__from_arrow__(self.to_arrow())
)
else:
pd_series = self.to_arrow().to_pandas(**kwargs)

@@ -370,7 +375,6 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase:
"""
if not isinstance(array, (pa.Array, pa.ChunkedArray)):
raise TypeError("array should be PyArrow array or chunked array")

data = pa.table([array], [None])
if isinstance(array.type, pa.DictionaryType):
indices_table = pa.table(
@@ -406,6 +410,10 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase:
)
elif isinstance(array.type, pa.StructType):
return cudf.core.column.StructColumn.from_arrow(array)
elif isinstance(
array.type, pd.core.arrays._arrow_utils.ArrowIntervalType
):
return cudf.core.column.IntervalColumn.from_arrow(array)

return libcudf.interop.from_arrow(data, data.column_names)._data[
"None"
@@ -1001,6 +1009,12 @@ def astype(self, dtype: Dtype, **kwargs) -> ColumnBase:
"Casting list columns not currently supported"
)
return self
elif is_interval_dtype(self.dtype):
if not self.dtype == dtype:
raise NotImplementedError(
"Casting interval columns not currently supported"
)
return self
elif np.issubdtype(dtype, np.datetime64):
return self.as_datetime_column(dtype, **kwargs)
elif np.issubdtype(dtype, np.timedelta64):
@@ -1581,6 +1595,15 @@ def build_column(
null_count=null_count,
children=children,
)
elif is_interval_dtype(dtype):
return cudf.core.column.IntervalColumn(
dtype=dtype,
mask=mask,
size=size,
offset=offset,
null_count=null_count,
children=children,
)
else:
assert data is not None
return cudf.core.column.NumericalColumn(
@@ -1619,7 +1642,6 @@ def build_categorical_column(
ordered : bool
Indicates whether the categories are ordered
"""

codes_dtype = min_unsigned_type(len(categories))
codes = as_column(codes)
if codes.dtype != codes_dtype:
@@ -1765,6 +1787,8 @@ def as_column(
return as_column(arbitrary.array)
if is_categorical_dtype(arbitrary):
data = as_column(pa.array(arbitrary, from_pandas=True))
elif is_interval_dtype(arbitrary.dtype):
data = as_column(pa.array(arbitrary, from_pandas=True))
elif arbitrary.dtype == np.bool_:
data = as_column(cupy.asarray(arbitrary), dtype=arbitrary.dtype)
elif arbitrary.dtype.kind in ("f"):
@@ -1886,6 +1910,18 @@ def as_column(
mask=mask,
dtype=arbitrary.dtype,
)
elif (
arbitrary.size != 0
and arb_dtype.kind in ("O")
and isinstance(arbitrary[0], pd._libs.interval.Interval)
):
# changing from pd array to series; possible arrow bug
interval_series = pd.Series(arbitrary)
data = as_column(
pa.Array.from_pandas(interval_series), dtype=arbitrary.dtype,
)
if dtype is not None:
data = data.astype(dtype)
elif arb_dtype.kind in ("O", "U"):
data = as_column(
pa.Array.from_pandas(arbitrary), dtype=arbitrary.dtype
@@ -1916,7 +1952,17 @@ def as_column(
arb_dtype = check_cast_unsupported_dtype(arbitrary.dtype)
if arb_dtype != arbitrary.dtype.numpy_dtype:
arbitrary = arbitrary.astype(arb_dtype)
if arb_dtype.kind in ("O", "U"):
if (
arbitrary.size != 0
and isinstance(arbitrary[0], pd._libs.interval.Interval)
and arb_dtype.kind in ("O")
):
# changing from pd array to series; possible arrow bug
interval_series = pd.Series(arbitrary)
data = as_column(
pa.Array.from_pandas(interval_series), dtype=arb_dtype
)
elif arb_dtype.kind in ("O", "U"):
data = as_column(pa.Array.from_pandas(arbitrary), dtype=arb_dtype)
else:
data = as_column(
@@ -1971,7 +2017,7 @@ def as_column(
)
return cudf.core.column.DecimalColumn.from_arrow(data)
dtype = pd.api.types.pandas_dtype(dtype)
if is_categorical_dtype(dtype):
if is_categorical_dtype(dtype) or is_interval_dtype(dtype):
raise TypeError
else:
np_type = np.dtype(dtype).type
@@ -1997,6 +2043,9 @@ def as_column(
elif np_type == np.str_:
sr = pd.Series(arbitrary, dtype="str")
data = as_column(sr, nan_as_null=nan_as_null)
elif is_interval_dtype(dtype):
sr = pd.Series(arbitrary, dtype="interval")
data = as_column(sr, nan_as_null=nan_as_null)
else:
data = as_column(
_construct_array(arbitrary, dtype),
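A hedged sketch of the `as_column` path the hunks above add for object arrays of `pd.Interval` values; the expected result type is assumed, not verified:

```python
import numpy as np
import pandas as pd
from cudf.core.column import as_column

# An object-dtype array whose first element is a pd.Interval takes the new
# branch: it is wrapped in a pandas Series first (working around the Arrow
# quirk noted in the comment) and then converted via pyarrow.
arr = np.array([pd.Interval(0, 1), pd.Interval(1, 2)], dtype="object")
col = as_column(arr)
print(type(col))  # expected: cudf's IntervalColumn
```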
91 changes: 91 additions & 0 deletions python/cudf/cudf/core/column/interval.py
@@ -0,0 +1,91 @@
# Copyright (c) 2018-2021, NVIDIA CORPORATION.
import pyarrow as pa
import cudf
from cudf.core.column import StructColumn


class IntervalColumn(StructColumn):
def __init__(
self,
dtype,
mask=None,
size=None,
offset=0,
null_count=None,
children=(),
closed="right",
):

super().__init__(
data=None,
dtype=dtype,
mask=mask,
size=size,
offset=offset,
null_count=null_count,
children=children,
)
if closed in ["left", "right", "neither", "both"]:
self._closed = closed
else:
raise ValueError("closed value is not valid")

@property
def closed(self):
return self._closed

@classmethod
def from_arrow(cls, data):
new_col = super().from_arrow(data.storage)
size = len(data)
dtype = cudf.core.dtypes.IntervalDtype.from_arrow(data.type)
mask = data.buffers()[0]
if mask is not None:
mask = cudf.utils.utils.pa_mask_buffer_to_mask(mask, len(data))

offset = data.offset
null_count = data.null_count
children = new_col.children
closed = dtype.closed

return IntervalColumn(
size=size,
dtype=dtype,
mask=mask,
offset=offset,
null_count=null_count,
children=children,
closed=closed,
)

def to_arrow(self):
typ = self.dtype.to_arrow()
return pa.ExtensionArray.from_storage(typ, super().to_arrow())

def from_struct_column(self, closed="right"):
return IntervalColumn(
size=self.size,
dtype=cudf.core.dtypes.IntervalDtype(
self.dtype.fields["left"], closed
),
mask=self.base_mask,
offset=self.offset,
null_count=self.null_count,
children=self.base_children,
closed=closed,
)

def copy(self, deep=True):
closed = self.closed
struct_copy = super().copy(deep=deep)
return IntervalColumn(
size=struct_copy.size,
dtype=cudf.core.dtypes.IntervalDtype(
struct_copy.dtype.fields["left"], closed
),
mask=struct_copy.base_mask,
offset=struct_copy.offset,
null_count=struct_copy.null_count,
children=struct_copy.base_children,
closed=closed,
)
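For orientation, a sketch of how this column's Arrow hooks fit together; the storage layout assumed below (a struct of left/right values wrapped in pandas' `ArrowIntervalType`) follows the types used elsewhere in this PR, and the behaviour is assumed rather than verified:

```python
import pyarrow as pa
from pandas.core.arrays._arrow_utils import ArrowIntervalType
from cudf.core.column.interval import IntervalColumn

# Build an Arrow extension array whose storage is a struct of
# {left, right} pairs, tagged with the pandas interval extension type.
typ = ArrowIntervalType(pa.int64(), "left")
storage = pa.array(
    [{"left": 0, "right": 1}, {"left": 1, "right": 2}],
    type=typ.storage_type,
)
arrow_array = pa.ExtensionArray.from_storage(typ, storage)

# from_arrow() unpacks the storage into an IntervalColumn and carries the
# closed side over from the extension type; to_arrow() re-wraps the struct
# storage in the same extension type.
col = IntervalColumn.from_arrow(arrow_array)
print(col.closed)                # expected: "left"
round_tripped = col.to_arrow()   # back to a pyarrow ExtensionArray
```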
5 changes: 4 additions & 1 deletion python/cudf/cudf/core/dataframe.py
@@ -409,11 +409,14 @@ def _init_from_list_like(self, data, index=None, columns=None):
index = as_index(index)

self._index = as_index(index)

# list-of-dicts case
if len(data) > 0 and isinstance(data[0], dict):
data = DataFrame.from_pandas(pd.DataFrame(data))
self._data = data._data
# interval in a list
elif len(data) > 0 and isinstance(data[0], pd._libs.interval.Interval):
data = DataFrame.from_pandas(pd.DataFrame(data))
self._data = data._data
else:
data = list(itertools.zip_longest(*data))

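A small sketch of the list-of-intervals branch added above, assuming the rest of the constructor behaves as in pandas:

```python
import pandas as pd
import cudf

intervals = [pd.Interval(0, 1), pd.Interval(1, 2), pd.Interval(2, 3)]

# The first element is a pd.Interval, so _init_from_list_like routes the
# data through a pandas DataFrame before copying it to the GPU, mirroring
# the existing list-of-dicts handling.
gdf = cudf.DataFrame(intervals)
```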
39 changes: 38 additions & 1 deletion python/cudf/cudf/core/dtypes.py
@@ -8,6 +8,7 @@
import pandas as pd
import pyarrow as pa
from pandas.api.extensions import ExtensionDtype
from pandas.core.arrays._arrow_utils import ArrowIntervalType

import cudf
from cudf._typing import Dtype
@@ -219,7 +220,7 @@ def __eq__(self, other):
return self._typ.equals(other._typ)

def __repr__(self):
return f"StructDtype({self.fields})"
return f"{type(self).__name__}({self.fields})"

def __hash__(self):
return hash(self._typ)
@@ -304,3 +305,39 @@ def _validate(cls, precision, scale=0):
)
if abs(scale) > precision:
raise ValueError(f"scale={scale} exceeds precision={precision}")


class IntervalDtype(StructDtype):
name = "interval"

def __init__(self, subtype, closed="right"):
"""
subtype: str, np.dtype
The dtype of the Interval bounds.
closed: {'right', 'left', 'both', 'neither'}, default 'right'
Whether the interval is closed on the left side, right side,
both, or neither.
"""
super().__init__(fields={"left": subtype, "right": subtype})

if closed in ["left", "right", "neither", "both"]:
self.closed = closed
else:
raise ValueError("closed value is not valid")

@property
def subtype(self):
return self.fields["left"]

def __repr__(self):
return f"interval[{self.fields['left']}]"

@classmethod
def from_arrow(cls, typ):
return IntervalDtype(typ.subtype.to_pandas_dtype(), typ.closed)

def to_arrow(self):
return ArrowIntervalType(
pa.from_numpy_dtype(self.subtype), self.closed
)
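A short sketch exercising the new dtype's constructor, `subtype` property, and the Arrow conversions defined above:

```python
import numpy as np
from cudf.core.dtypes import IntervalDtype

dtype = IntervalDtype(np.dtype("int64"), closed="both")
print(dtype.subtype)  # int64
print(dtype.closed)   # both

# Round trip through pandas' ArrowIntervalType extension type.
arrow_type = dtype.to_arrow()
assert IntervalDtype.from_arrow(arrow_type).subtype == dtype.subtype
```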
12 changes: 12 additions & 0 deletions python/cudf/cudf/core/frame.py
@@ -2403,6 +2403,18 @@ def _copy_type_metadata(

return self

def _copy_interval_data(self, other, include_index=True):
for name, col, other_col in zip(
self._data.keys(), self._data.values(), other._data.values()
):
if isinstance(other_col, cudf.core.column.IntervalColumn):
self._data[name] = cudf.core.column.IntervalColumn(col)

def _postprocess_columns(self, other, include_index=True):
self._copy_categories(other, include_index=include_index)
self._copy_struct_names(other, include_index=include_index)
self._copy_interval_data(other, include_index=include_index)

def _unaryop(self, op):
data_columns = (col.unary_operator(op) for col in self._columns)
data = zip(self._column_names, data_columns)
10 changes: 10 additions & 0 deletions python/cudf/cudf/tests/test_dtypes.py
@@ -11,6 +11,7 @@
Decimal64Dtype,
ListDtype,
StructDtype,
IntervalDtype,
)
from cudf.tests.utils import assert_eq

@@ -145,3 +146,12 @@ def test_max_precision():
Decimal64Dtype(scale=0, precision=18)
with pytest.raises(ValueError):
Decimal64Dtype(scale=0, precision=19)


@pytest.mark.parametrize("fields", ["int64", "int32"])
@pytest.mark.parametrize("closed", ["left", "right", "both", "neither"])
def test_interval_dtype_pyarrow_round_trip(fields, closed):
pa_array = pd.core.arrays._arrow_utils.ArrowIntervalType(fields, closed)
expect = pa_array
got = IntervalDtype.from_arrow(expect).to_arrow()
assert expect.equals(got)