diff --git a/python/cudf/cudf/core/column/__init__.py b/python/cudf/cudf/core/column/__init__.py index 81dab52d353..e0aa9471a2f 100644 --- a/python/cudf/cudf/core/column/__init__.py +++ b/python/cudf/cudf/core/column/__init__.py @@ -21,4 +21,5 @@ from cudf.core.column.string import StringColumn # noqa: F401 from cudf.core.column.struct import StructColumn # noqa: F401 from cudf.core.column.timedelta import TimeDeltaColumn # noqa: F401 +from cudf.core.column.interval import IntervalColumn # noqa: F401 from cudf.core.column.decimal import DecimalColumn # noqa: F401 diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index d615a7cfae4..ed9d54a1283 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -54,6 +54,7 @@ is_scalar, is_string_dtype, is_struct_dtype, + is_interval_dtype, min_signed_type, min_unsigned_type, np_to_pa_dtype, @@ -117,6 +118,10 @@ def to_pandas( pd_series = pd.Series(pandas_array, copy=False) elif str(self.dtype) in NUMERIC_TYPES and self.null_count == 0: pd_series = pd.Series(cupy.asnumpy(self.values), copy=False) + elif is_interval_dtype(self.dtype): + pd_series = pd.Series( + pd.IntervalDtype().__from_arrow__(self.to_arrow()) + ) else: pd_series = self.to_arrow().to_pandas(**kwargs) @@ -370,7 +375,6 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase: """ if not isinstance(array, (pa.Array, pa.ChunkedArray)): raise TypeError("array should be PyArrow array or chunked array") - data = pa.table([array], [None]) if isinstance(array.type, pa.DictionaryType): indices_table = pa.table( @@ -406,6 +410,10 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase: ) elif isinstance(array.type, pa.StructType): return cudf.core.column.StructColumn.from_arrow(array) + elif isinstance( + array.type, pd.core.arrays._arrow_utils.ArrowIntervalType + ): + return cudf.core.column.IntervalColumn.from_arrow(array) return libcudf.interop.from_arrow(data, data.column_names)._data[ "None" @@ -1001,6 +1009,12 @@ def astype(self, dtype: Dtype, **kwargs) -> ColumnBase: "Casting list columns not currently supported" ) return self + elif is_interval_dtype(self.dtype): + if not self.dtype == dtype: + raise NotImplementedError( + "Casting interval columns not currently supported" + ) + return self elif np.issubdtype(dtype, np.datetime64): return self.as_datetime_column(dtype, **kwargs) elif np.issubdtype(dtype, np.timedelta64): @@ -1581,6 +1595,15 @@ def build_column( null_count=null_count, children=children, ) + elif is_interval_dtype(dtype): + return cudf.core.column.IntervalColumn( + dtype=dtype, + mask=mask, + size=size, + offset=offset, + null_count=null_count, + children=children, + ) else: assert data is not None return cudf.core.column.NumericalColumn( @@ -1619,7 +1642,6 @@ def build_categorical_column( ordered : bool Indicates whether the categories are ordered """ - codes_dtype = min_unsigned_type(len(categories)) codes = as_column(codes) if codes.dtype != codes_dtype: @@ -1765,6 +1787,8 @@ def as_column( return as_column(arbitrary.array) if is_categorical_dtype(arbitrary): data = as_column(pa.array(arbitrary, from_pandas=True)) + elif is_interval_dtype(arbitrary.dtype): + data = as_column(pa.array(arbitrary, from_pandas=True)) elif arbitrary.dtype == np.bool_: data = as_column(cupy.asarray(arbitrary), dtype=arbitrary.dtype) elif arbitrary.dtype.kind in ("f"): @@ -1886,6 +1910,18 @@ def as_column( mask=mask, dtype=arbitrary.dtype, ) + elif ( + arbitrary.size != 0 + and arb_dtype.kind in ("O") + and isinstance(arbitrary[0], pd._libs.interval.Interval) + ): + # changing from pd array to series,possible arrow bug + interval_series = pd.Series(arbitrary) + data = as_column( + pa.Array.from_pandas(interval_series), dtype=arbitrary.dtype, + ) + if dtype is not None: + data = data.astype(dtype) elif arb_dtype.kind in ("O", "U"): data = as_column( pa.Array.from_pandas(arbitrary), dtype=arbitrary.dtype @@ -1916,7 +1952,17 @@ def as_column( arb_dtype = check_cast_unsupported_dtype(arbitrary.dtype) if arb_dtype != arbitrary.dtype.numpy_dtype: arbitrary = arbitrary.astype(arb_dtype) - if arb_dtype.kind in ("O", "U"): + if ( + arbitrary.size != 0 + and isinstance(arbitrary[0], pd._libs.interval.Interval) + and arb_dtype.kind in ("O") + ): + # changing from pd array to series,possible arrow bug + interval_series = pd.Series(arbitrary) + data = as_column( + pa.Array.from_pandas(interval_series), dtype=arb_dtype + ) + elif arb_dtype.kind in ("O", "U"): data = as_column(pa.Array.from_pandas(arbitrary), dtype=arb_dtype) else: data = as_column( @@ -1971,7 +2017,7 @@ def as_column( ) return cudf.core.column.DecimalColumn.from_arrow(data) dtype = pd.api.types.pandas_dtype(dtype) - if is_categorical_dtype(dtype): + if is_categorical_dtype(dtype) or is_interval_dtype(dtype): raise TypeError else: np_type = np.dtype(dtype).type @@ -1997,6 +2043,9 @@ def as_column( elif np_type == np.str_: sr = pd.Series(arbitrary, dtype="str") data = as_column(sr, nan_as_null=nan_as_null) + elif is_interval_dtype(dtype): + sr = pd.Series(arbitrary, dtype="interval") + data = as_column(sr, nan_as_null=nan_as_null) else: data = as_column( _construct_array(arbitrary, dtype), diff --git a/python/cudf/cudf/core/column/interval.py b/python/cudf/cudf/core/column/interval.py new file mode 100644 index 00000000000..e9991bef071 --- /dev/null +++ b/python/cudf/cudf/core/column/interval.py @@ -0,0 +1,91 @@ +# Copyright (c) 2018-2021, NVIDIA CORPORATION. +import pyarrow as pa +import cudf +from cudf.core.column import StructColumn + + +class IntervalColumn(StructColumn): + def __init__( + self, + dtype, + mask=None, + size=None, + offset=0, + null_count=None, + children=(), + closed="right", + ): + + super().__init__( + data=None, + dtype=dtype, + mask=mask, + size=size, + offset=offset, + null_count=null_count, + children=children, + ) + if closed in ["left", "right", "neither", "both"]: + self._closed = closed + else: + raise ValueError("closed value is not valid") + + @property + def closed(self): + return self._closed + + @classmethod + def from_arrow(self, data): + new_col = super().from_arrow(data.storage) + size = len(data) + dtype = cudf.core.dtypes.IntervalDtype.from_arrow(data.type) + mask = data.buffers()[0] + if mask is not None: + mask = cudf.utils.utils.pa_mask_buffer_to_mask(mask, len(data)) + + offset = data.offset + null_count = data.null_count + children = new_col.children + closed = dtype.closed + + return IntervalColumn( + size=size, + dtype=dtype, + mask=mask, + offset=offset, + null_count=null_count, + children=children, + closed=closed, + ) + + def to_arrow(self): + typ = self.dtype.to_arrow() + return pa.ExtensionArray.from_storage(typ, super().to_arrow()) + + def from_struct_column(self, closed="right"): + return IntervalColumn( + size=self.size, + dtype=cudf.core.dtypes.IntervalDtype( + self.dtype.fields["left"], closed + ), + mask=self.base_mask, + offset=self.offset, + null_count=self.null_count, + children=self.base_children, + closed=closed, + ) + + def copy(self, deep=True): + closed = self.closed + struct_copy = super().copy(deep=deep) + return IntervalColumn( + size=struct_copy.size, + dtype=cudf.core.dtypes.IntervalDtype( + struct_copy.dtype.fields["left"], closed + ), + mask=struct_copy.base_mask, + offset=struct_copy.offset, + null_count=struct_copy.null_count, + children=struct_copy.base_children, + closed=closed, + ) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 59e7a0a7a8a..3e7e6625abe 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -409,11 +409,14 @@ def _init_from_list_like(self, data, index=None, columns=None): index = as_index(index) self._index = as_index(index) - # list-of-dicts case if len(data) > 0 and isinstance(data[0], dict): data = DataFrame.from_pandas(pd.DataFrame(data)) self._data = data._data + # interval in a list + elif len(data) > 0 and isinstance(data[0], pd._libs.interval.Interval): + data = DataFrame.from_pandas(pd.DataFrame(data)) + self._data = data._data else: data = list(itertools.zip_longest(*data)) diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index a8bbc5ee7ff..f11f3692faf 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -8,6 +8,7 @@ import pandas as pd import pyarrow as pa from pandas.api.extensions import ExtensionDtype +from pandas.core.arrays._arrow_utils import ArrowIntervalType import cudf from cudf._typing import Dtype @@ -219,7 +220,7 @@ def __eq__(self, other): return self._typ.equals(other._typ) def __repr__(self): - return f"StructDtype({self.fields})" + return f"{type(self).__name__}({self.fields})" def __hash__(self): return hash(self._typ) @@ -304,3 +305,39 @@ def _validate(cls, precision, scale=0): ) if abs(scale) > precision: raise ValueError(f"scale={scale} exceeds precision={precision}") + + +class IntervalDtype(StructDtype): + name = "interval" + + def __init__(self, subtype, closed="right"): + """ + subtype: str, np.dtype + The dtype of the Interval bounds. + closed: {‘right’, ‘left’, ‘both’, ‘neither’}, default ‘right’ + Whether the interval is closed on the left-side, right-side, + both or neither. See the Notes for more detailed explanation. + """ + super().__init__(fields={"left": subtype, "right": subtype}) + + if closed in ["left", "right", "neither", "both"]: + self.closed = closed + else: + raise ValueError("closed value is not valid") + + @property + def subtype(self): + return self.fields["left"] + + def __repr__(self): + return f"interval[{self.fields['left']}]" + + @classmethod + def from_arrow(cls, typ): + return IntervalDtype(typ.subtype.to_pandas_dtype(), typ.closed) + + def to_arrow(self): + + return ArrowIntervalType( + pa.from_numpy_dtype(self.subtype), self.closed + ) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 2a1aed814fe..e763a164003 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -2403,6 +2403,18 @@ def _copy_type_metadata( return self + def _copy_interval_data(self, other, include_index=True): + for name, col, other_col in zip( + self._data.keys(), self._data.values(), other._data.values() + ): + if isinstance(other_col, cudf.core.column.IntervalColumn): + self._data[name] = cudf.core.column.IntervalColumn(col) + + def _postprocess_columns(self, other, include_index=True): + self._copy_categories(other, include_index=include_index) + self._copy_struct_names(other, include_index=include_index) + self._copy_interval_data(other, include_index=include_index) + def _unaryop(self, op): data_columns = (col.unary_operator(op) for col in self._columns) data = zip(self._column_names, data_columns) diff --git a/python/cudf/cudf/tests/test_dtypes.py b/python/cudf/cudf/tests/test_dtypes.py index 32cecec3f60..b6e2aac0304 100644 --- a/python/cudf/cudf/tests/test_dtypes.py +++ b/python/cudf/cudf/tests/test_dtypes.py @@ -11,6 +11,7 @@ Decimal64Dtype, ListDtype, StructDtype, + IntervalDtype, ) from cudf.tests.utils import assert_eq @@ -145,3 +146,12 @@ def test_max_precision(): Decimal64Dtype(scale=0, precision=18) with pytest.raises(ValueError): Decimal64Dtype(scale=0, precision=19) + + +@pytest.mark.parametrize("fields", ["int64", "int32"]) +@pytest.mark.parametrize("closed", ["left", "right", "both", "neither"]) +def test_interval_dtype_pyarrow_round_trip(fields, closed): + pa_array = pd.core.arrays._arrow_utils.ArrowIntervalType(fields, closed) + expect = pa_array + got = IntervalDtype.from_arrow(expect).to_arrow() + assert expect.equals(got) diff --git a/python/cudf/cudf/tests/test_interval.py b/python/cudf/cudf/tests/test_interval.py new file mode 100644 index 00000000000..c7eafedd409 --- /dev/null +++ b/python/cudf/cudf/tests/test_interval.py @@ -0,0 +1,132 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. + +import pandas as pd +import pytest + +import cudf +from cudf.tests.utils import assert_eq + + +@pytest.mark.parametrize( + "data1, data2", [(1, 2), (1.0, 2.0), (3, 4.0)], +) +@pytest.mark.parametrize("data3, data4", [(6, 10), (5.0, 9.0), (2, 6.0)]) +@pytest.mark.parametrize("closed", ["left", "right", "both", "neither"]) +def test_create_interval_series(data1, data2, data3, data4, closed): + + expect = pd.Series(pd.Interval(data1, data2, closed), dtype="interval") + got = cudf.Series(pd.Interval(data1, data2, closed), dtype="interval") + assert_eq(expect, got) + + expect_two = pd.Series( + [pd.Interval(data1, data2, closed), pd.Interval(data3, data4, closed)], + dtype="interval", + ) + got_two = cudf.Series( + [pd.Interval(data1, data2, closed), pd.Interval(data3, data4, closed)], + dtype="interval", + ) + assert_eq(expect_two, got_two) + + expect_three = pd.Series( + [ + pd.Interval(data1, data2, closed), + pd.Interval(data3, data4, closed), + pd.Interval(data1, data2, closed), + ], + dtype="interval", + ) + got_three = cudf.Series( + [ + pd.Interval(data1, data2, closed), + pd.Interval(data3, data4, closed), + pd.Interval(data1, data2, closed), + ], + dtype="interval", + ) + assert_eq(expect_three, got_three) + + +@pytest.mark.parametrize( + "data1, data2", [(1, 2), (1.0, 2.0), (3, 4.0)], +) +@pytest.mark.parametrize("data3, data4", [(6, 10), (5.0, 9.0), (2, 6.0)]) +@pytest.mark.parametrize("closed", ["left", "right", "both", "neither"]) +def test_create_interval_df(data1, data2, data3, data4, closed): + # df for both pandas and cudf only works when interval is in a list + expect = pd.DataFrame( + [pd.Interval(data1, data2, closed)], dtype="interval" + ) + got = cudf.DataFrame([pd.Interval(data1, data2, closed)], dtype="interval") + assert_eq(expect, got) + + expect_two = pd.DataFrame( + { + "a": [ + pd.Interval(data1, data2, closed), + pd.Interval(data3, data4, closed), + ], + "b": [ + pd.Interval(data3, data4, closed), + pd.Interval(data1, data2, closed), + ], + }, + dtype="interval", + ) + got_two = cudf.DataFrame( + { + "a": [ + pd.Interval(data1, data2, closed), + pd.Interval(data3, data4, closed), + ], + "b": [ + pd.Interval(data3, data4, closed), + pd.Interval(data1, data2, closed), + ], + }, + dtype="interval", + ) + assert_eq(expect_two, got_two) + + expect_three = pd.DataFrame( + { + "a": [ + pd.Interval(data1, data2, closed), + pd.Interval(data3, data4, closed), + pd.Interval(data1, data2, closed), + ], + "b": [ + pd.Interval(data3, data4, closed), + pd.Interval(data1, data2, closed), + pd.Interval(data3, data4, closed), + ], + "c": [ + pd.Interval(data1, data2, closed), + pd.Interval(data1, data2, closed), + pd.Interval(data3, data4, closed), + ], + }, + dtype="interval", + ) + + got_three = cudf.DataFrame( + { + "a": [ + pd.Interval(data1, data2, closed), + pd.Interval(data3, data4, closed), + pd.Interval(data1, data2, closed), + ], + "b": [ + pd.Interval(data3, data4, closed), + pd.Interval(data1, data2, closed), + pd.Interval(data3, data4, closed), + ], + "c": [ + pd.Interval(data1, data2, closed), + pd.Interval(data1, data2, closed), + pd.Interval(data3, data4, closed), + ], + }, + dtype="interval", + ) + assert_eq(expect_three, got_three) diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index d49b4abd399..274285990a6 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -232,13 +232,25 @@ def is_list_dtype(obj): def is_struct_dtype(obj): return ( - type(obj) is cudf.core.dtypes.StructDtype + isinstance(obj, cudf.core.dtypes.StructDtype) or obj is cudf.core.dtypes.StructDtype or (isinstance(obj, str) and obj == cudf.core.dtypes.StructDtype.name) or (hasattr(obj, "dtype") and is_struct_dtype(obj.dtype)) ) +def is_interval_dtype(obj): + return ( + isinstance(obj, cudf.core.dtypes.IntervalDtype) + or isinstance(obj, pd.core.dtypes.dtypes.IntervalDtype) + or obj is cudf.core.dtypes.IntervalDtype + or ( + isinstance(obj, str) and obj == cudf.core.dtypes.IntervalDtype.name + ) + or (hasattr(obj, "dtype") and is_interval_dtype(obj.dtype)) + ) + + def is_decimal_dtype(obj): return ( type(obj) is cudf.core.dtypes.Decimal64Dtype