Adding Interval Dtype (#6984)
This PR adds the Interval dtype to cuDF. Interval is an ExtensionDtype for interval data. This PR closes #5376.
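A rough usage sketch of what this enables, assuming Series construction from pandas interval data is covered by the hunks below (a sketch, not the committed API surface):

```python
import pandas as pd
import cudf

# Interval data created with pandas...
pd_sr = pd.Series(pd.IntervalIndex.from_breaks([0, 1, 2, 3], closed="right"))

# ...can now be moved into a cudf Series and round-tripped back.
# to_pandas() converts through Arrow into a pandas Series backed by
# pd.IntervalDtype.
gdf_sr = cudf.Series(pd_sr)
roundtrip = gdf_sr.to_pandas()
```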

Authors:
  - Marlene (@marlenezw)

Approvers:
  - Keith Kraus (@kkraus14)

URL: #6984
marlenezw authored Feb 19, 2021
1 parent 593dc1c commit 0d03d9f
Showing 9 changed files with 354 additions and 7 deletions.
1 change: 1 addition & 0 deletions python/cudf/cudf/core/column/__init__.py
@@ -21,4 +21,5 @@
from cudf.core.column.string import StringColumn # noqa: F401
from cudf.core.column.struct import StructColumn # noqa: F401
from cudf.core.column.timedelta import TimeDeltaColumn # noqa: F401
from cudf.core.column.interval import IntervalColumn # noqa: F401
from cudf.core.column.decimal import DecimalColumn # noqa: F401
57 changes: 53 additions & 4 deletions python/cudf/cudf/core/column/column.py
@@ -54,6 +54,7 @@
is_scalar,
is_string_dtype,
is_struct_dtype,
is_interval_dtype,
min_signed_type,
min_unsigned_type,
np_to_pa_dtype,
@@ -117,6 +118,10 @@ def to_pandas(
pd_series = pd.Series(pandas_array, copy=False)
elif str(self.dtype) in NUMERIC_TYPES and self.null_count == 0:
pd_series = pd.Series(cupy.asnumpy(self.values), copy=False)
elif is_interval_dtype(self.dtype):
pd_series = pd.Series(
pd.IntervalDtype().__from_arrow__(self.to_arrow())
)
else:
pd_series = self.to_arrow().to_pandas(**kwargs)

@@ -370,7 +375,6 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase:
"""
if not isinstance(array, (pa.Array, pa.ChunkedArray)):
raise TypeError("array should be PyArrow array or chunked array")

data = pa.table([array], [None])
if isinstance(array.type, pa.DictionaryType):
indices_table = pa.table(
@@ -406,6 +410,10 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase:
)
elif isinstance(array.type, pa.StructType):
return cudf.core.column.StructColumn.from_arrow(array)
elif isinstance(
array.type, pd.core.arrays._arrow_utils.ArrowIntervalType
):
return cudf.core.column.IntervalColumn.from_arrow(array)

return libcudf.interop.from_arrow(data, data.column_names)._data[
"None"
@@ -1001,6 +1009,12 @@ def astype(self, dtype: Dtype, **kwargs) -> ColumnBase:
"Casting list columns not currently supported"
)
return self
elif is_interval_dtype(self.dtype):
if not self.dtype == dtype:
raise NotImplementedError(
"Casting interval columns not currently supported"
)
return self
elif np.issubdtype(dtype, np.datetime64):
return self.as_datetime_column(dtype, **kwargs)
elif np.issubdtype(dtype, np.timedelta64):
@@ -1581,6 +1595,15 @@ def build_column(
null_count=null_count,
children=children,
)
elif is_interval_dtype(dtype):
return cudf.core.column.IntervalColumn(
dtype=dtype,
mask=mask,
size=size,
offset=offset,
null_count=null_count,
children=children,
)
else:
assert data is not None
return cudf.core.column.NumericalColumn(
@@ -1619,7 +1642,6 @@ def build_categorical_column(
ordered : bool
Indicates whether the categories are ordered
"""

codes_dtype = min_unsigned_type(len(categories))
codes = as_column(codes)
if codes.dtype != codes_dtype:
@@ -1765,6 +1787,8 @@ def as_column(
return as_column(arbitrary.array)
if is_categorical_dtype(arbitrary):
data = as_column(pa.array(arbitrary, from_pandas=True))
elif is_interval_dtype(arbitrary.dtype):
data = as_column(pa.array(arbitrary, from_pandas=True))
elif arbitrary.dtype == np.bool_:
data = as_column(cupy.asarray(arbitrary), dtype=arbitrary.dtype)
elif arbitrary.dtype.kind in ("f"):
@@ -1886,6 +1910,18 @@ def as_column(
mask=mask,
dtype=arbitrary.dtype,
)
elif (
arbitrary.size != 0
and arb_dtype.kind in ("O")
and isinstance(arbitrary[0], pd._libs.interval.Interval)
):
# changing from pd array to series; possible arrow bug
interval_series = pd.Series(arbitrary)
data = as_column(
pa.Array.from_pandas(interval_series), dtype=arbitrary.dtype,
)
if dtype is not None:
data = data.astype(dtype)
elif arb_dtype.kind in ("O", "U"):
data = as_column(
pa.Array.from_pandas(arbitrary), dtype=arbitrary.dtype
@@ -1916,7 +1952,17 @@ def as_column(
arb_dtype = check_cast_unsupported_dtype(arbitrary.dtype)
if arb_dtype != arbitrary.dtype.numpy_dtype:
arbitrary = arbitrary.astype(arb_dtype)
if arb_dtype.kind in ("O", "U"):
if (
arbitrary.size != 0
and isinstance(arbitrary[0], pd._libs.interval.Interval)
and arb_dtype.kind in ("O")
):
# changing from pd array to series; possible arrow bug
interval_series = pd.Series(arbitrary)
data = as_column(
pa.Array.from_pandas(interval_series), dtype=arb_dtype
)
elif arb_dtype.kind in ("O", "U"):
data = as_column(pa.Array.from_pandas(arbitrary), dtype=arb_dtype)
else:
data = as_column(
@@ -1971,7 +2017,7 @@ def as_column(
)
return cudf.core.column.DecimalColumn.from_arrow(data)
dtype = pd.api.types.pandas_dtype(dtype)
if is_categorical_dtype(dtype):
if is_categorical_dtype(dtype) or is_interval_dtype(dtype):
raise TypeError
else:
np_type = np.dtype(dtype).type
@@ -1997,6 +2043,9 @@ def as_column(
elif np_type == np.str_:
sr = pd.Series(arbitrary, dtype="str")
data = as_column(sr, nan_as_null=nan_as_null)
elif is_interval_dtype(dtype):
sr = pd.Series(arbitrary, dtype="interval")
data = as_column(sr, nan_as_null=nan_as_null)
else:
data = as_column(
_construct_array(arbitrary, dtype),
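A hedged sketch of the `as_column` path the hunks above add for object arrays of `pd.Interval` values; the expected result type is assumed, not verified:

```python
import numpy as np
import pandas as pd
from cudf.core.column import as_column

# An object-dtype array whose first element is a pd.Interval takes the new
# branch: it is wrapped in a pandas Series first (working around the Arrow
# quirk noted in the comment) and then converted via pyarrow.
arr = np.array([pd.Interval(0, 1), pd.Interval(1, 2)], dtype="object")
col = as_column(arr)
print(type(col))  # expected: cudf's IntervalColumn
```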
91 changes: 91 additions & 0 deletions python/cudf/cudf/core/column/interval.py
@@ -0,0 +1,91 @@
# Copyright (c) 2018-2021, NVIDIA CORPORATION.
import pyarrow as pa
import cudf
from cudf.core.column import StructColumn


class IntervalColumn(StructColumn):
def __init__(
self,
dtype,
mask=None,
size=None,
offset=0,
null_count=None,
children=(),
closed="right",
):

super().__init__(
data=None,
dtype=dtype,
mask=mask,
size=size,
offset=offset,
null_count=null_count,
children=children,
)
if closed in ["left", "right", "neither", "both"]:
self._closed = closed
else:
raise ValueError("closed value is not valid")

@property
def closed(self):
return self._closed

@classmethod
def from_arrow(cls, data):
new_col = super().from_arrow(data.storage)
size = len(data)
dtype = cudf.core.dtypes.IntervalDtype.from_arrow(data.type)
mask = data.buffers()[0]
if mask is not None:
mask = cudf.utils.utils.pa_mask_buffer_to_mask(mask, len(data))

offset = data.offset
null_count = data.null_count
children = new_col.children
closed = dtype.closed

return IntervalColumn(
size=size,
dtype=dtype,
mask=mask,
offset=offset,
null_count=null_count,
children=children,
closed=closed,
)

def to_arrow(self):
typ = self.dtype.to_arrow()
return pa.ExtensionArray.from_storage(typ, super().to_arrow())

def from_struct_column(self, closed="right"):
return IntervalColumn(
size=self.size,
dtype=cudf.core.dtypes.IntervalDtype(
self.dtype.fields["left"], closed
),
mask=self.base_mask,
offset=self.offset,
null_count=self.null_count,
children=self.base_children,
closed=closed,
)

def copy(self, deep=True):
closed = self.closed
struct_copy = super().copy(deep=deep)
return IntervalColumn(
size=struct_copy.size,
dtype=cudf.core.dtypes.IntervalDtype(
struct_copy.dtype.fields["left"], closed
),
mask=struct_copy.base_mask,
offset=struct_copy.offset,
null_count=struct_copy.null_count,
children=struct_copy.base_children,
closed=closed,
)
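For orientation, a sketch of how this column's Arrow hooks fit together; the storage layout assumed below (a struct of left/right values wrapped in pandas' `ArrowIntervalType`) follows the types used elsewhere in this PR, and the behaviour is assumed rather than verified:

```python
import pyarrow as pa
from pandas.core.arrays._arrow_utils import ArrowIntervalType
from cudf.core.column.interval import IntervalColumn

# Build an Arrow extension array whose storage is a struct of
# {left, right} pairs, tagged with the pandas interval extension type.
typ = ArrowIntervalType(pa.int64(), "left")
storage = pa.array(
    [{"left": 0, "right": 1}, {"left": 1, "right": 2}],
    type=typ.storage_type,
)
arrow_array = pa.ExtensionArray.from_storage(typ, storage)

# from_arrow() unpacks the storage into an IntervalColumn and carries the
# closed side over from the extension type; to_arrow() re-wraps the struct
# storage in the same extension type.
col = IntervalColumn.from_arrow(arrow_array)
print(col.closed)                # expected: "left"
round_tripped = col.to_arrow()   # back to a pyarrow ExtensionArray
```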
5 changes: 4 additions & 1 deletion python/cudf/cudf/core/dataframe.py
@@ -409,11 +409,14 @@ def _init_from_list_like(self, data, index=None, columns=None):
index = as_index(index)

self._index = as_index(index)

# list-of-dicts case
if len(data) > 0 and isinstance(data[0], dict):
data = DataFrame.from_pandas(pd.DataFrame(data))
self._data = data._data
# interval in a list
elif len(data) > 0 and isinstance(data[0], pd._libs.interval.Interval):
data = DataFrame.from_pandas(pd.DataFrame(data))
self._data = data._data
else:
data = list(itertools.zip_longest(*data))

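A small sketch of the list-of-intervals branch added above, assuming the rest of the constructor behaves as in pandas:

```python
import pandas as pd
import cudf

intervals = [pd.Interval(0, 1), pd.Interval(1, 2), pd.Interval(2, 3)]

# The first element is a pd.Interval, so _init_from_list_like routes the
# data through a pandas DataFrame before copying it to the GPU, mirroring
# the existing list-of-dicts handling.
gdf = cudf.DataFrame(intervals)
```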
39 changes: 38 additions & 1 deletion python/cudf/cudf/core/dtypes.py
@@ -8,6 +8,7 @@
import pandas as pd
import pyarrow as pa
from pandas.api.extensions import ExtensionDtype
from pandas.core.arrays._arrow_utils import ArrowIntervalType

import cudf
from cudf._typing import Dtype
@@ -219,7 +220,7 @@ def __eq__(self, other):
return self._typ.equals(other._typ)

def __repr__(self):
return f"StructDtype({self.fields})"
return f"{type(self).__name__}({self.fields})"

def __hash__(self):
return hash(self._typ)
@@ -304,3 +305,39 @@ def _validate(cls, precision, scale=0):
)
if abs(scale) > precision:
raise ValueError(f"scale={scale} exceeds precision={precision}")


class IntervalDtype(StructDtype):
name = "interval"

def __init__(self, subtype, closed="right"):
"""
subtype: str, np.dtype
The dtype of the Interval bounds.
closed: {'right', 'left', 'both', 'neither'}, default 'right'
Whether the interval is closed on the left side, right side,
both, or neither.
"""
super().__init__(fields={"left": subtype, "right": subtype})

if closed in ["left", "right", "neither", "both"]:
self.closed = closed
else:
raise ValueError("closed value is not valid")

@property
def subtype(self):
return self.fields["left"]

def __repr__(self):
return f"interval[{self.fields['left']}]"

@classmethod
def from_arrow(cls, typ):
return IntervalDtype(typ.subtype.to_pandas_dtype(), typ.closed)

def to_arrow(self):
return ArrowIntervalType(
pa.from_numpy_dtype(self.subtype), self.closed
)
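A short sketch exercising the new dtype's constructor, `subtype` property, and the Arrow conversions defined above:

```python
import numpy as np
from cudf.core.dtypes import IntervalDtype

dtype = IntervalDtype(np.dtype("int64"), closed="both")
print(dtype.subtype)  # int64
print(dtype.closed)   # both

# Round trip through pandas' ArrowIntervalType extension type.
arrow_type = dtype.to_arrow()
assert IntervalDtype.from_arrow(arrow_type).subtype == dtype.subtype
```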
12 changes: 12 additions & 0 deletions python/cudf/cudf/core/frame.py
@@ -2403,6 +2403,18 @@ def _copy_type_metadata(

return self

def _copy_interval_data(self, other, include_index=True):
for name, col, other_col in zip(
self._data.keys(), self._data.values(), other._data.values()
):
if isinstance(other_col, cudf.core.column.IntervalColumn):
self._data[name] = cudf.core.column.IntervalColumn(col)

def _postprocess_columns(self, other, include_index=True):
self._copy_categories(other, include_index=include_index)
self._copy_struct_names(other, include_index=include_index)
self._copy_interval_data(other, include_index=include_index)

def _unaryop(self, op):
data_columns = (col.unary_operator(op) for col in self._columns)
data = zip(self._column_names, data_columns)
10 changes: 10 additions & 0 deletions python/cudf/cudf/tests/test_dtypes.py
@@ -11,6 +11,7 @@
Decimal64Dtype,
ListDtype,
StructDtype,
IntervalDtype,
)
from cudf.tests.utils import assert_eq

@@ -145,3 +146,12 @@ def test_max_precision():
Decimal64Dtype(scale=0, precision=18)
with pytest.raises(ValueError):
Decimal64Dtype(scale=0, precision=19)


@pytest.mark.parametrize("fields", ["int64", "int32"])
@pytest.mark.parametrize("closed", ["left", "right", "both", "neither"])
def test_interval_dtype_pyarrow_round_trip(fields, closed):
pa_array = pd.core.arrays._arrow_utils.ArrowIntervalType(fields, closed)
expect = pa_array
got = IntervalDtype.from_arrow(expect).to_arrow()
assert expect.equals(got)