Skip to content

Commit

Permalink
Interval index and interval_range (rapidsai#7182)
Browse files Browse the repository at this point in the history
This PR is a follow-up to PR rapidsai#6984. It adds the `IntervalIndex` class and the `interval_range` function to cuDF. `IntervalIndex` is needed for the `cut` and `histogram` methods, and this PR closes issue rapidsai#5376.

The main files to take note of in this PR are `index.py` and the two cudf `__init__.py` files. The remaining file changes relate to PR rapidsai#6984, which created the interval dtype.

Authors:
  - Marlene  (https://github.com/marlenezw)
  - Michael Wang (https://github.com/isVoid)

Approvers:
  - Keith Kraus (https://github.com/kkraus14)

URL: rapidsai#7182
  • Loading branch information
marlenezw authored and shwina committed Apr 7, 2021
1 parent 6338e9c commit 9d52c8a
Show file tree
Hide file tree
Showing 7 changed files with 591 additions and 16 deletions.
2 changes: 2 additions & 0 deletions python/cudf/cudf/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@
from cudf.core import (
NA,
CategoricalIndex,
interval_range,
IntervalIndex,
DataFrame,
DatetimeIndex,
Float32Index,
Expand Down
2 changes: 2 additions & 0 deletions python/cudf/cudf/core/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
from cudf.core.dataframe import DataFrame, from_pandas, merge
from cudf.core.index import (
CategoricalIndex,
interval_range,
IntervalIndex,
DatetimeIndex,
Float32Index,
Float64Index,
Expand Down
69 changes: 63 additions & 6 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
from cudf.core.abc import Serializable
from cudf.core.buffer import Buffer
from cudf.core.dtypes import CategoricalDtype
from cudf.core.dtypes import IntervalDtype
from cudf.utils import ioutils, utils
from cudf.utils.dtypes import (
NUMERIC_TYPES,
Expand Down Expand Up @@ -1046,11 +1047,7 @@ def astype(self, dtype: Dtype, **kwargs) -> ColumnBase:
)
return self
elif is_interval_dtype(self.dtype):
if not self.dtype == dtype:
raise NotImplementedError(
"Casting interval columns not currently supported"
)
return self
return self.as_interval_column(dtype, **kwargs)
elif is_decimal_dtype(dtype):
return self.as_decimal_column(dtype, **kwargs)
elif np.issubdtype(dtype, np.datetime64):
Expand Down Expand Up @@ -1113,6 +1110,11 @@ def as_datetime_column(
) -> "cudf.core.column.DatetimeColumn":
raise NotImplementedError

def as_interval_column(
    self, dtype: Dtype, **kwargs
) -> "cudf.core.column.IntervalColumn":
    """
    Convert this column to an interval column of the given dtype.

    This base implementation provides no conversion and always raises;
    column types that support interval casting are expected to override
    it.

    Parameters
    ----------
    dtype : Dtype
        Requested interval dtype.
    **kwargs
        Accepted for signature parity with the other ``as_*_column``
        methods; unused here.

    Raises
    ------
    NotImplementedError
        Always.
    """
    raise NotImplementedError

def as_timedelta_column(
self, dtype: Dtype, **kwargs
) -> "cudf.core.column.TimeDeltaColumn":
Expand Down Expand Up @@ -1633,6 +1635,15 @@ def build_column(
null_count=null_count,
children=children,
)
elif is_interval_dtype(dtype):
return cudf.core.column.IntervalColumn(
dtype=dtype,
mask=mask,
size=size,
offset=offset,
children=children,
null_count=null_count,
)
elif is_struct_dtype(dtype):
if size is None:
raise TypeError("Must specify size")
Expand Down Expand Up @@ -1714,6 +1725,52 @@ def build_categorical_column(
return cast("cudf.core.column.CategoricalColumn", result)


def build_interval_column(
    left_col,
    right_col,
    mask=None,
    size=None,
    offset=0,
    null_count=None,
    closed="right",
):
    """
    Build an IntervalColumn

    Parameters
    ----------
    left_col : Column
        Column of values representing the left of the interval
    right_col : Column
        Column of values representing the right of the interval
    mask : Buffer
        Null mask
    size : int, optional
        Accepted for interface compatibility. The size passed on to
        ``build_column`` is always the length of the converted left
        column, so any value given here is ignored.
    offset : int, optional
    null_count : int, optional
    closed : {"left", "right", "both", "neither"}, default "right"
        Whether the intervals are closed on the left-side, right-side,
        both or neither. Any other value silently falls back to
        "right".

    Returns
    -------
    The result of ``build_column`` for an ``IntervalDtype`` whose children
    are the converted left and right columns.
    """
    left = as_column(left_col)
    right = as_column(right_col)
    if closed not in {"left", "right", "both", "neither"}:
        closed = "right"
    # Keep the caller's dtype when the input carries one. Inputs without a
    # dtype (lists, tuples, ranges, ...) take the dtype of the converted
    # column, so the declared interval subtype matches the left child
    # instead of being hard-coded to int64 or failing on ``.dtype``.
    subtype = getattr(left_col, "dtype", None)
    if subtype is None:
        subtype = left.dtype
    dtype = IntervalDtype(subtype, closed)
    return build_column(
        data=None,
        dtype=dtype,
        mask=mask,
        size=len(left),
        offset=offset,
        null_count=null_count,
        children=(left, right),
    )


def as_column(
arbitrary: Any,
nan_as_null: bool = None,
Expand Down Expand Up @@ -2102,7 +2159,7 @@ def as_column(
data = as_column(sr, nan_as_null=nan_as_null)
elif is_interval_dtype(dtype):
sr = pd.Series(arbitrary, dtype="interval")
data = as_column(sr, nan_as_null=nan_as_null)
data = as_column(sr, nan_as_null=nan_as_null, dtype=dtype)
else:
data = as_column(
_construct_array(arbitrary, dtype),
Expand Down
37 changes: 29 additions & 8 deletions python/cudf/cudf/core/column/interval.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
import pyarrow as pa
import cudf
from cudf.core.column import StructColumn
from cudf.core.dtypes import IntervalDtype
from cudf.utils.dtypes import is_interval_dtype


class IntervalColumn(StructColumn):
Expand Down Expand Up @@ -38,7 +40,7 @@ def closed(self):
def from_arrow(self, data):
new_col = super().from_arrow(data.storage)
size = len(data)
dtype = cudf.core.dtypes.IntervalDtype.from_arrow(data.type)
dtype = IntervalDtype.from_arrow(data.type)
mask = data.buffers()[0]
if mask is not None:
mask = cudf.utils.utils.pa_mask_buffer_to_mask(mask, len(data))
Expand All @@ -60,14 +62,17 @@ def from_arrow(self, data):

def to_arrow(self):
    """
    Convert this column to a pyarrow extension array.

    The struct representation produced by the parent class is wrapped
    in the arrow type obtained from ``self.dtype.to_arrow()``.
    """
    interval_type = self.dtype.to_arrow()
    storage = super().to_arrow()
    if len(storage) == 0:
        # An empty struct array comes back with null-typed children, so
        # rebuild it from the interval's storage type to give the
        # children a non-null type.
        storage = pa.array([], interval_type.storage_type)
    return pa.ExtensionArray.from_storage(interval_type, storage)

def from_struct_column(self, closed="right"):
return IntervalColumn(
size=self.size,
dtype=cudf.core.dtypes.IntervalDtype(
self.dtype.fields["left"], closed
),
dtype=IntervalDtype(self.dtype.fields["left"], closed),
mask=self.base_mask,
offset=self.offset,
null_count=self.null_count,
Expand All @@ -80,12 +85,28 @@ def copy(self, deep=True):
struct_copy = super().copy(deep=deep)
return IntervalColumn(
size=struct_copy.size,
dtype=cudf.core.dtypes.IntervalDtype(
struct_copy.dtype.fields["left"], closed
),
dtype=IntervalDtype(struct_copy.dtype.fields["left"], closed),
mask=struct_copy.base_mask,
offset=struct_copy.offset,
null_count=struct_copy.null_count,
children=struct_copy.base_children,
closed=closed,
)

def as_interval_column(self, dtype, **kwargs):
    """
    Return an IntervalColumn with the requested interval dtype.

    The new column is built from this column's size, mask, offset,
    null count and children; ``closed`` is taken from the resolved
    dtype.

    Parameters
    ----------
    dtype : interval dtype or the string "interval"
        Target dtype. The bare string ``"interval"`` resolves to an
        ``IntervalDtype`` built from this column's left field type and
        its current ``closed`` value.
    **kwargs
        Unused.

    Raises
    ------
    ValueError
        If ``dtype`` is not an interval dtype.
    """
    if not is_interval_dtype(dtype):
        raise ValueError("dtype must be IntervalDtype")

    # a user can directly input the string `interval` as the dtype
    # when creating an interval series or interval dataframe
    if dtype == "interval":
        dtype = IntervalDtype(self.dtype.fields["left"], self.closed)

    column_kwargs = {
        "size": self.size,
        "dtype": dtype,
        "mask": self.mask,
        "offset": self.offset,
        "null_count": self.null_count,
        "children": self.children,
        "closed": dtype.closed,
    }
    return IntervalColumn(**column_kwargs)
Loading

0 comments on commit 9d52c8a

Please sign in to comment.