rapidsai · rapids-bot · Feb 19, 2021 · Dec 11, 2020 · Dec 11, 2020 · Dec 14, 2020
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -6,6 +6,43 @@
 
 ## Bug Fixes
 
+# cuDF 0.18.0 (Date TBD)
+
+## New Features
+- PR #6856 Add groupby idxmin, idxmax aggregation
+- PR #6847 Add a cmake find module for cuFile in JNI code
+- PR #6902 Implement `DataFrame.quantile` for `datetime` and `timedelta` data types
+- PR #6814 Implement `cudf::reduce` for `decimal32` and `decimal64` (part 1)
+- PR #6929 Add `Index.set_names` api
+- PR #6907 Add `replace_null` API with `replace_policy` parameter, `fixed_width` column support
+- PR #6984 Add `interval` dtype to cudf
+- PR #6885 Share `factorize` implementation with Index and cudf module
+- PR #6775 Implement cudf.DateOffset for months
+- PR #7039 Support contains() on lists of primitives
+
+## Improvements
+
+- PR #6938 Pass numeric scalars of the same dtype through numeric binops
+- PR #6275 Update to official libcu++ on Github
+- PR #6838 Fix `columns` & `index` handling in dataframe constructor
+- PR #6750 Remove **kwargs from string/categorical methods
+- PR #6585 Add dictionary support to libcudf groupby functions
+- PR #6909 Support reading byte array backed decimal columns from parquet files
+- PR #6939 Use simplified `rmm::exec_policy`
+- PR #6512 Refactor rolling.cu to reduce compile time
+- PR #6982 Disable some pragma unroll statements in thrust `sort.h`
+- PR #7051 Verify decimal cast in java package
+- PR #7120 Verify window operations on decimal in java package
+
+## Bug Fixes
+
+- PR #6884 Correct the sampling range when sampling with replacement
+- PR #6903 Add null count test for apply_boolean_mask
+- PR #6922 Fix N/A detection for empty fields in CSV reader
+- PR #6912 Fix rmm_mode=managed parameter for gtests
+- PR #6943 Fix join with nulls not equal performance
+- PR #6945 Fix groupby agg/apply behaviour when no key columns are provided 
+- PR #6942 Fix cudf::merge gtest for dictionary columns
 # 0.18.0
 
 Please see https://github.com/rapidsai/cudf/releases/tag/branch-0.18-latest for the latest changes to this development branch.

@@ -21,4 +21,5 @@
 from cudf.core.column.string import StringColumn  # noqa: F401
 from cudf.core.column.struct import StructColumn  # noqa: F401
 from cudf.core.column.timedelta import TimeDeltaColumn  # noqa: F401
+from cudf.core.column.interval import IntervalColumn  # noqa: F401
 from cudf.core.column.decimal import DecimalColumn  # noqa: F401
@@ -54,6 +54,7 @@
     is_scalar,
     is_string_dtype,
     is_struct_dtype,
+    is_interval_dtype,
     min_signed_type,
     min_unsigned_type,
     np_to_pa_dtype,
@@ -117,6 +118,10 @@ def to_pandas(
             pd_series = pd.Series(pandas_array, copy=False)
         elif str(self.dtype) in NUMERIC_TYPES and self.null_count == 0:
             pd_series = pd.Series(cupy.asnumpy(self.values), copy=False)
+        elif is_interval_dtype(self.dtype):
+            pd_series = pd.Series(
+                pd.IntervalDtype().__from_arrow__(self.to_arrow())
+            )
         else:
             pd_series = self.to_arrow().to_pandas(**kwargs)
 
@@ -370,7 +375,6 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase:
         """
         if not isinstance(array, (pa.Array, pa.ChunkedArray)):
             raise TypeError("array should be PyArrow array or chunked array")
-
         data = pa.table([array], [None])
         if isinstance(array.type, pa.DictionaryType):
             indices_table = pa.table(
@@ -406,6 +410,10 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase:
             )
         elif isinstance(array.type, pa.StructType):
             return cudf.core.column.StructColumn.from_arrow(array)
+        elif isinstance(
+            array.type, pd.core.arrays._arrow_utils.ArrowIntervalType
+        ):
+            return cudf.core.column.IntervalColumn.from_arrow(array)
 
         return libcudf.interop.from_arrow(data, data.column_names)._data[
             "None"
@@ -1001,6 +1009,12 @@ def astype(self, dtype: Dtype, **kwargs) -> ColumnBase:
                     "Casting list columns not currently supported"
                 )
             return self
+        elif is_interval_dtype(self.dtype):
+            if not self.dtype == dtype:
+                raise NotImplementedError(
+                    "Casting interval columns not currently supported"
+                )
+            return self
         elif np.issubdtype(dtype, np.datetime64):
             return self.as_datetime_column(dtype, **kwargs)
         elif np.issubdtype(dtype, np.timedelta64):
@@ -1581,6 +1595,16 @@ def build_column(
             null_count=null_count,
             children=children,
         )
+    elif is_interval_dtype(dtype):
+        return cudf.core.column.IntervalColumn(
+            data=data,
+            dtype=dtype,
+            mask=mask,
+            size=size,
+            offset=offset,
+            null_count=null_count,
+            children=children,
+        )
     else:
         assert data is not None
         return cudf.core.column.NumericalColumn(
@@ -1619,7 +1643,6 @@ def build_categorical_column(
     ordered : bool
         Indicates whether the categories are ordered
     """
-
     codes_dtype = min_unsigned_type(len(categories))
     codes = as_column(codes)
     if codes.dtype != codes_dtype:
@@ -1765,6 +1788,8 @@ def as_column(
             return as_column(arbitrary.array)
         if is_categorical_dtype(arbitrary):
             data = as_column(pa.array(arbitrary, from_pandas=True))
+        elif is_interval_dtype(arbitrary.dtype):
+            data = as_column(pa.array(arbitrary, from_pandas=True))
         elif arbitrary.dtype == np.bool_:
             data = as_column(cupy.asarray(arbitrary), dtype=arbitrary.dtype)
         elif arbitrary.dtype.kind in ("f"):
@@ -1886,6 +1911,18 @@ def as_column(
                 mask=mask,
                 dtype=arbitrary.dtype,
             )
+        elif (
+            len(arbitrary) > 0
+            and arb_dtype.kind in ("O")
+            and isinstance(arbitrary[0], pd._libs.interval.Interval)
+        ):
+            # changing from pd array to series,possible arrow bug
+            interval_series = pd.Series(arbitrary)
+            data = as_column(
+                pa.Array.from_pandas(interval_series), dtype=arbitrary.dtype,
+            )
+            if dtype is not None:
+                data = data.astype(dtype)
         elif arb_dtype.kind in ("O", "U"):
             data = as_column(
                 pa.Array.from_pandas(arbitrary), dtype=arbitrary.dtype
@@ -1916,7 +1953,17 @@ def as_column(
                 arb_dtype = check_cast_unsupported_dtype(arbitrary.dtype)
                 if arb_dtype != arbitrary.dtype.numpy_dtype:
                     arbitrary = arbitrary.astype(arb_dtype)
-        if arb_dtype.kind in ("O", "U"):
+        if (
+            len(arbitrary) > 0
+            and isinstance(arbitrary[0], pd._libs.interval.Interval)
+            and arb_dtype.kind in ("O")
+        ):
+            # changing from pd array to series,possible arrow bug
+            interval_series = pd.Series(arbitrary)
+            data = as_column(
+                pa.Array.from_pandas(interval_series), dtype=arb_dtype
+            )
+        elif arb_dtype.kind in ("O", "U"):
             data = as_column(pa.Array.from_pandas(arbitrary), dtype=arb_dtype)
         else:
             data = as_column(
@@ -1971,7 +2018,7 @@ def as_column(
                         )
                         return cudf.core.column.DecimalColumn.from_arrow(data)
                     dtype = pd.api.types.pandas_dtype(dtype)
-                    if is_categorical_dtype(dtype):
+                    if is_categorical_dtype(dtype) or is_interval_dtype(dtype):
                         raise TypeError
                     else:
                         np_type = np.dtype(dtype).type
@@ -1997,6 +2044,9 @@ def as_column(
                 elif np_type == np.str_:
                     sr = pd.Series(arbitrary, dtype="str")
                     data = as_column(sr, nan_as_null=nan_as_null)
+                elif is_interval_dtype(dtype):
+                    sr = pd.Series(arbitrary, dtype="interval")
+                    data = as_column(sr, nan_as_null=nan_as_null)
                 else:
                     data = as_column(
                         _construct_array(arbitrary, dtype),

@@ -0,0 +1,91 @@
+# Copyright (c) 2018-2021, NVIDIA CORPORATION.
+import pyarrow as pa
+import cudf
+from cudf.core.column import StructColumn
+
+
+class IntervalColumn(StructColumn):
+    def __init__(
+        self,
+        dtype,
+        mask=None,
+        size=None,
+        offset=0,
+        null_count=None,
+        children=(),
+        closed="right",
+    ):
+
+        super().__init__(
+            data=None,
+            dtype=dtype,
+            mask=mask,
+            size=size,
+            offset=offset,
+            null_count=null_count,
+            children=children,
+        )
+        self._closed = closed
+
+    @property
+    def closed(self):
+        if self._closed in ["left", "right", "neither", "both"]:
+            return self._closed
+        else:
+            raise ValueError("closed value is not valid")
+
+    @classmethod
+    def from_arrow(self, data):
+        new_col = super().from_arrow(data.storage)
+        size = len(data)
+        dtype = cudf.core.dtypes.IntervalDtype.from_arrow(data.type)
+        mask = data.buffers()[0]
+        if mask is not None:
+            mask = cudf.utils.utils.pa_mask_buffer_to_mask(mask, len(data))
+
+        offset = data.offset
+        null_count = data.null_count
+        children = new_col.children
+        closed = dtype.closed
+
+        return IntervalColumn(
+            size=size,
+            dtype=dtype,
+            mask=mask,
+            offset=offset,
+            null_count=null_count,
+            children=children,
+            closed=closed,
+        )
+
+    def to_arrow(self):
+        typ = self.dtype.to_arrow()
+        return pa.ExtensionArray.from_storage(typ, super().to_arrow())
+
+    def from_struct_column(self, closed="right"):
+        return IntervalColumn(
+            size=self.size,
+            dtype=cudf.core.dtypes.IntervalDtype(
+                self.dtype.fields["left"], closed
+            ),
+            mask=self.mask,
+            offset=self.offset,
+            null_count=self.null_count,
+            children=self.children,
+            closed=closed,
+        )
+
+    def copy(self, deep=True):
+        closed = self.closed
+        struct_copy = super().copy(deep=deep)
+        return IntervalColumn(
+            size=struct_copy.size,
+            dtype=cudf.core.dtypes.IntervalDtype(
+                struct_copy.dtype.fields["left"], closed
+            ),
+            mask=struct_copy.mask,
+            offset=struct_copy.offset,
+            null_count=struct_copy.null_count,
+            children=struct_copy.children,
+            closed=closed,
+        )
@@ -409,11 +409,14 @@ def _init_from_list_like(self, data, index=None, columns=None):
             index = as_index(index)
 
         self._index = as_index(index)
-
         # list-of-dicts case
         if len(data) > 0 and isinstance(data[0], dict):
             data = DataFrame.from_pandas(pd.DataFrame(data))
             self._data = data._data
+        # interval in a list
+        elif len(data) > 0 and isinstance(data[0], pd._libs.interval.Interval):
+            data = DataFrame.from_pandas(pd.DataFrame(data))
+            self._data = data._data
         else:
             data = list(itertools.zip_longest(*data))
 

@@ -8,6 +8,7 @@
 import pandas as pd
 import pyarrow as pa
 from pandas.api.extensions import ExtensionDtype
+from pandas.core.arrays._arrow_utils import ArrowIntervalType
 
 import cudf
 from cudf._typing import Dtype
@@ -219,7 +220,7 @@ def __eq__(self, other):
         return self._typ.equals(other._typ)
 
     def __repr__(self):
-        return f"StructDtype({self.fields})"
+        return f"{type(self).__name__}({self.fields})"
 
     def __hash__(self):
         return hash(self._typ)
@@ -304,3 +305,32 @@ def _validate(cls, precision, scale=0):
             )
         if abs(scale) > precision:
             raise ValueError(f"scale={scale} exceeds precision={precision}")
+
+
+class IntervalDtype(StructDtype):
+    name = "interval"
+
+    def __init__(self, subtype, closed="right"):
+        """
+        subtype: str, np.dtype
+            The dtype of the Interval bounds.
+        """
+        super().__init__(fields={"left": subtype, "right": subtype})
+        self.closed = closed
+
+    @property
+    def subtype(self):
+        return self.fields["left"]
+
+    def __repr__(self):
+        return f"interval[{self.fields['left']}]"
+
+    @classmethod
+    def from_arrow(cls, typ):
+        return IntervalDtype(typ.subtype.to_pandas_dtype(), typ.closed)
+
+    def to_arrow(self):
+
+        return ArrowIntervalType(
+            pa.from_numpy_dtype(self.subtype), self.closed
+        )
@@ -2403,6 +2403,19 @@ def _copy_type_metadata(
 
         return self
 
+    def _copy_interval_data(self, other, include_index=True):
+        for name, col, other_col in zip(
+            self._data.keys(), self._data.values(), other._data.values()
+        ):
+            if isinstance(other_col, cudf.core.column.IntervalColumn):
+                self._data[name] = cudf.core.column.IntervalColumn(col)
+                # col.as_interval_column()
+
+    def _postprocess_columns(self, other, include_index=True):
+        self._copy_categories(other, include_index=include_index)
+        self._copy_struct_names(other, include_index=include_index)
+        self._copy_interval_data(other, include_index=include_index)
+
     def _unaryop(self, op):
         data_columns = (col.unary_operator(op) for col in self._columns)
         data = zip(self._column_names, data_columns)

@@ -11,6 +11,7 @@
     Decimal64Dtype,
     ListDtype,
     StructDtype,
+    IntervalDtype,
 )
 from cudf.tests.utils import assert_eq
 
@@ -145,3 +146,12 @@ def test_max_precision():
     Decimal64Dtype(scale=0, precision=18)
     with pytest.raises(ValueError):
         Decimal64Dtype(scale=0, precision=19)
+
+
+@pytest.mark.parametrize("fields", ["int64", "int32"])
+@pytest.mark.parametrize("closed", ["left", "right", "both", "neither"])
+def test_interval_dtype_pyarrow_round_trip(fields, closed):
+    pa_array = pd.core.arrays._arrow_utils.ArrowIntervalType(fields, closed)
+    expect = pa_array
+    got = IntervalDtype.from_arrow(expect).to_arrow()
+    assert expect.equals(got)