From 4b3aec0b7719067a1275694ede908a36ebf1eb88 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 17 Jan 2024 10:25:59 -0800 Subject: [PATCH 1/8] Refactor IntervalIndex.__init__ --- python/cudf/cudf/core/_base_index.py | 2 +- python/cudf/cudf/core/index.py | 53 ++++++++++--------- .../cudf/cudf/tests/indexes/test_interval.py | 21 +++++++- 3 files changed, 50 insertions(+), 26 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 2aef77b6c99..2bc83c1789b 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -1911,7 +1911,7 @@ def drop_duplicates( nulls_are_equal: bool, default True Null elements are considered equal to other null elements. """ - + breakpoint() # This utilizes the fact that all `Index` is also a `Frame`. # Except RangeIndex. return self._from_columns_like_self( diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index e012d8e7140..09a50b5ac10 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -3224,39 +3224,44 @@ def __init__( copy=False, name=None, ): - if copy: - data = column.as_column(data, dtype=dtype).copy() - kwargs = _setdefault_name(data, name=name) + name = _setdefault_name(data, name=name) - if closed is None: - closed = "right" + if dtype is not None: + dtype = cudf.dtype(dtype) + if not isinstance(dtype, IntervalDtype): + raise TypeError("dtype must be an IntervalDtype") + if closed is not None and closed != dtype.closed: + raise ValueError("closed keyword does not match dtype.closed") + closed = dtype.closed - if isinstance(data, IntervalColumn): - data = data - elif isinstance(data, pd.Series) and isinstance( - data.dtype, pd.IntervalDtype - ): - data = column.as_column(data, data.dtype) - elif isinstance(data, (pd.Interval, pd.IntervalIndex)): - data = column.as_column( - data, - dtype=dtype, - ) + if closed is None and isinstance(dtype, IntervalDtype): + closed = dtype.closed + + closed = closed or "right" + + if dtype is None and len(data) == 0: + dtype = IntervalDtype(getattr(data, "dtype", "int64"), closed) + + data = as_column(data) + if not isinstance(data, IntervalColumn) and len(data) > 0: + raise TypeError("data must be an iterable of Interval data") elif len(data) == 0: subtype = getattr(data, "dtype", "int64") dtype = IntervalDtype(subtype, closed) - data = column.column_empty_like_same_mask( - column.as_column(data), dtype - ) - else: - data = column.as_column(data) - data.dtype.closed = closed + if copy: + data = data.copy() + + if dtype: + data = data.astype(dtype) self.closed = closed - super().__init__(data, **kwargs) + super().__init__(data, name=name) @_cudf_nvtx_annotate - def from_breaks(breaks, closed="right", name=None, copy=False, dtype=None): + @classmethod + def from_breaks( + cls, breaks, closed="right", name=None, copy=False, dtype=None + ): """ Construct an IntervalIndex from an array of splits. diff --git a/python/cudf/cudf/tests/indexes/test_interval.py b/python/cudf/cudf/tests/indexes/test_interval.py index 52c49aebf35..8b214a9edff 100644 --- a/python/cudf/cudf/tests/indexes/test_interval.py +++ b/python/cudf/cudf/tests/indexes/test_interval.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. import numpy as np import pandas as pd import pyarrow as pa @@ -315,3 +315,22 @@ def test_intervalindex_empty_typed_non_int(): result = cudf.IntervalIndex(data) expected = pd.IntervalIndex(data) assert_eq(result, expected) + + +def test_intervalindex_invalid_dtype(): + with pytest.raises(TypeError): + cudf.IntervalIndex([pd.Interval(1, 2)], dtype="int64") + + +def test_intervalindex_conflicting_closed(): + with pytest.raises(TypeError): + cudf.IntervalIndex( + [pd.Interval(1, 2)], + dtype=cudf.IntervalDtype("int64", closed="left"), + closed="right", + ) + + +def test_intervalindex_invalid_data(): + with pytest.raises(TypeError): + cudf.IntervalIndex([1, 2]) From 4aa79e75ebd4861d55954b2030ef8399c1bf4989 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 17 Jan 2024 19:39:57 -0800 Subject: [PATCH 2/8] Refactor and add validation to IntervalIndex.__init__ --- python/cudf/cudf/core/_base_index.py | 1 - python/cudf/cudf/core/column/column.py | 50 +---------- python/cudf/cudf/core/column/interval.py | 23 ++--- python/cudf/cudf/core/index.py | 87 ++++++++++++------- .../cudf/cudf/tests/indexes/test_interval.py | 10 +-- 5 files changed, 68 insertions(+), 103 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 2bc83c1789b..b4aa9daa727 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -1911,7 +1911,6 @@ def drop_duplicates( nulls_are_equal: bool, default True Null elements are considered equal to other null elements. """ - breakpoint() # This utilizes the fact that all `Index` is also a `Frame`. # Except RangeIndex. return self._from_columns_like_self( diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 705862c502a..12c2e7bfe1f 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1001,14 +1001,14 @@ def astype( "`.astype('str')` instead." ) return col.as_string_column(dtype, format=format) + elif isinstance(dtype, IntervalDtype): + return col.as_interval_column(dtype) elif isinstance(dtype, (ListDtype, StructDtype)): if not col.dtype == dtype: raise NotImplementedError( f"Casting {self.dtype} columns not currently supported" ) return col - elif isinstance(dtype, IntervalDtype): - return col.as_interval_column(dtype) elif isinstance(dtype, cudf.core.dtypes.DecimalDtype): return col.as_decimal_column(dtype) elif np.issubdtype(cast(Any, dtype), np.datetime64): @@ -1697,52 +1697,6 @@ def build_categorical_column( return cast("cudf.core.column.CategoricalColumn", result) -def build_interval_column( - left_col, - right_col, - mask=None, - size=None, - offset=0, - null_count=None, - closed="right", -): - """ - Build an IntervalColumn - - Parameters - ---------- - left_col : Column - Column of values representing the left of the interval - right_col : Column - Column of representing the right of the interval - mask : Buffer - Null mask - size : int, optional - offset : int, optional - closed : {"left", "right", "both", "neither"}, default "right" - Whether the intervals are closed on the left-side, right-side, - both or neither. - """ - left = as_column(left_col) - right = as_column(right_col) - if closed not in {"left", "right", "both", "neither"}: - closed = "right" - if type(left_col) is not list: - dtype = IntervalDtype(left_col.dtype, closed) - else: - dtype = IntervalDtype("int64", closed) - size = len(left) - return build_column( - data=None, - dtype=dtype, - mask=mask, - size=size, - offset=offset, - null_count=null_count, - children=(left, right), - ) - - def build_list_column( indices: ColumnBase, elements: ColumnBase, diff --git a/python/cudf/cudf/core/column/interval.py b/python/cudf/cudf/core/column/interval.py index 6a7e7729123..b125bbc8a38 100644 --- a/python/cudf/cudf/core/column/interval.py +++ b/python/cudf/cudf/core/column/interval.py @@ -18,7 +18,6 @@ def __init__( offset=0, null_count=None, children=(), - closed="right", ): super().__init__( data=None, @@ -29,14 +28,6 @@ def __init__( null_count=null_count, children=children, ) - if closed in ["left", "right", "neither", "both"]: - self._closed = closed - else: - raise ValueError("closed value is not valid") - - @property - def closed(self): - return self._closed @classmethod def from_arrow(cls, data): @@ -50,7 +41,6 @@ def from_arrow(cls, data): offset = data.offset null_count = data.null_count children = new_col.children - closed = dtype.closed return IntervalColumn( size=size, @@ -59,7 +49,6 @@ def from_arrow(cls, data): offset=offset, null_count=null_count, children=children, - closed=closed, ) def to_arrow(self): @@ -83,20 +72,19 @@ def from_struct_column(cls, struct_column: StructColumn, closed="right"): offset=struct_column.offset, null_count=struct_column.null_count, children=struct_column.base_children, - closed=closed, ) def copy(self, deep=True): - closed = self.closed struct_copy = super().copy(deep=deep) return IntervalColumn( size=struct_copy.size, - dtype=IntervalDtype(struct_copy.dtype.fields["left"], closed), + dtype=IntervalDtype( + struct_copy.dtype.fields["left"], self.dtype.closed + ), mask=struct_copy.base_mask, offset=struct_copy.offset, null_count=struct_copy.null_count, children=struct_copy.base_children, - closed=closed, ) def as_interval_column(self, dtype): @@ -109,7 +97,7 @@ def as_interval_column(self, dtype): # when creating an interval series or interval dataframe if dtype == "interval": dtype = IntervalDtype( - self.dtype.fields["left"], self.closed + self.dtype.fields["left"], self.dtype.closed ) children = self.children return IntervalColumn( @@ -119,7 +107,6 @@ def as_interval_column(self, dtype): offset=self.offset, null_count=self.null_count, children=children, - closed=dtype.closed, ) else: raise ValueError("dtype must be IntervalDtype") @@ -141,5 +128,5 @@ def to_pandas( def element_indexing(self, index: int): result = super().element_indexing(index) if cudf.get_option("mode.pandas_compatible"): - return pd.Interval(**result, closed=self._closed) + return pd.Interval(**result, closed=self.dtype.closed) return result diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 09a50b5ac10..64adf93b063 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -3176,10 +3176,12 @@ def interval_range( data = column.column_empty_like_same_mask(left_col, dtype) return IntervalIndex(data, closed=closed) - interval_col = column.build_interval_column( - left_col, right_col, closed=closed + interval_col = IntervalColumn( + dtype=IntervalDtype(left_col.dtype, closed), + size=len(left_col), + children=(left_col, right_col), ) - return IntervalIndex(interval_col) + return IntervalIndex(interval_col, closed=closed) class IntervalIndex(GenericIndex): @@ -3219,12 +3221,12 @@ class IntervalIndex(GenericIndex): def __init__( self, data, - closed=None, + closed: Optional[Literal["left", "right", "neither", "both"]] = None, dtype=None, - copy=False, + copy: bool = False, name=None, ): - name = _setdefault_name(data, name=name) + name = _setdefault_name(data, name=name)["name"] if dtype is not None: dtype = cudf.dtype(dtype) @@ -3239,28 +3241,51 @@ def __init__( closed = closed or "right" - if dtype is None and len(data) == 0: - dtype = IntervalDtype(getattr(data, "dtype", "int64"), closed) + if len(data) == 0: + if not hasattr(data, "dtype"): + data = np.array([], dtype=np.int64) + elif isinstance(data.dtype, (pd.IntervalDtype, IntervalDtype)): + data = np.array([], dtype=data.dtype.subtype) + interval_col = IntervalColumn( + dtype=IntervalDtype(data.dtype, closed), + size=len(data), + children=(as_column(data), as_column(data)), + ) + else: + col = as_column(data) + if not isinstance(col, IntervalColumn): + raise TypeError("data must be an iterable of Interval data") + if copy: + col = col.copy() + interval_col = IntervalColumn( + dtype=IntervalDtype(col.dtype.subtype, closed), + mask=col.mask, + size=col.size, + offset=col.offset, + null_count=col.null_count, + children=col.children, + ) - data = as_column(data) - if not isinstance(data, IntervalColumn) and len(data) > 0: - raise TypeError("data must be an iterable of Interval data") - elif len(data) == 0: - subtype = getattr(data, "dtype", "int64") - dtype = IntervalDtype(subtype, closed) + if dtype: + interval_col = interval_col.astype(dtype) # type: ignore[assignment] - if copy: - data = data.copy() + super().__init__(interval_col, name=name) - if dtype: - data = data.astype(dtype) - self.closed = closed - super().__init__(data, name=name) + @property + def closed(self): + return self._values.dtype.closed - @_cudf_nvtx_annotate @classmethod + @_cudf_nvtx_annotate def from_breaks( - cls, breaks, closed="right", name=None, copy=False, dtype=None + cls, + breaks, + closed: Optional[ + Literal["left", "right", "neither", "both"] + ] = "right", + name=None, + copy: bool = False, + dtype=None, ): """ Construct an IntervalIndex from an array of splits. @@ -3290,16 +3315,18 @@ def from_breaks( >>> cudf.IntervalIndex.from_breaks([0, 1, 2, 3]) IntervalIndex([(0, 1], (1, 2], (2, 3]], dtype='interval[int64, right]') """ + breaks = column.as_column(breaks, dtype=dtype) if copy: - breaks = column.as_column(breaks, dtype=dtype).copy() - left_col = breaks[:-1:] - right_col = breaks[+1::] - - interval_col = column.build_interval_column( - left_col, right_col, closed=closed + breaks = breaks.copy() + left_col = breaks.slice(0, -1) + right_col = breaks.slice(1, len(breaks)) + + interval_col = IntervalColumn( + dtype=IntervalDtype(left_col.dtype, closed), + size=len(left_col), + children=(left_col, right_col), ) - - return IntervalIndex(interval_col, name=name) + return IntervalIndex(interval_col, name=name, closed=closed) def __getitem__(self, index): raise NotImplementedError( diff --git a/python/cudf/cudf/tests/indexes/test_interval.py b/python/cudf/cudf/tests/indexes/test_interval.py index 8b214a9edff..5a6155ece29 100644 --- a/python/cudf/cudf/tests/indexes/test_interval.py +++ b/python/cudf/cudf/tests/indexes/test_interval.py @@ -57,11 +57,9 @@ def test_interval_range_dtype_basic(start_t, end_t): @pytest.mark.parametrize("closed", ["left", "right", "both", "neither"]) -@pytest.mark.parametrize("start", [0]) -@pytest.mark.parametrize("end", [0]) -def test_interval_range_empty(start, end, closed): - pindex = pd.interval_range(start=start, end=end, closed=closed) - gindex = cudf.interval_range(start=start, end=end, closed=closed) +def test_interval_range_empty(closed): + pindex = pd.interval_range(start=0, end=0, closed=closed) + gindex = cudf.interval_range(start=0, end=0, closed=closed) assert_eq(pindex, gindex) @@ -323,7 +321,7 @@ def test_intervalindex_invalid_dtype(): def test_intervalindex_conflicting_closed(): - with pytest.raises(TypeError): + with pytest.raises(ValueError): cudf.IntervalIndex( [pd.Interval(1, 2)], dtype=cudf.IntervalDtype("int64", closed="left"), From ca544879d9520c6fb5503cafd24d9b94c8d22484 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 17 Jan 2024 19:40:53 -0800 Subject: [PATCH 3/8] whitspace --- python/cudf/cudf/core/_base_index.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index b4aa9daa727..45e570964ad 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -1913,6 +1913,7 @@ def drop_duplicates( """ # This utilizes the fact that all `Index` is also a `Frame`. # Except RangeIndex. + return self._from_columns_like_self( drop_duplicates( list(self._columns), From cc951c0baa054d47a0d3beb9e46c2c73c201a06d Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 17 Jan 2024 19:43:49 -0800 Subject: [PATCH 4/8] Whitespace again --- python/cudf/cudf/core/_base_index.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 45e570964ad..2aef77b6c99 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -1911,9 +1911,9 @@ def drop_duplicates( nulls_are_equal: bool, default True Null elements are considered equal to other null elements. """ + # This utilizes the fact that all `Index` is also a `Frame`. # Except RangeIndex. - return self._from_columns_like_self( drop_duplicates( list(self._columns), From 280672c17bac9a35da85df34e07eae445be5f8ec Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 18 Jan 2024 10:53:30 -0800 Subject: [PATCH 5/8] Invalid closed arg --- python/cudf/cudf/tests/test_udf_masked_ops.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/tests/test_udf_masked_ops.py b/python/cudf/cudf/tests/test_udf_masked_ops.py index ad0c961a749..11970944a95 100644 --- a/python/cudf/cudf/tests/test_udf_masked_ops.py +++ b/python/cudf/cudf/tests/test_udf_masked_ops.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. import math import operator @@ -636,7 +636,7 @@ def func(row): ["1.0", "2.0", "3.0"], dtype=cudf.Decimal64Dtype(2, 1) ), cudf.Series([1, 2, 3], dtype="category"), - cudf.interval_range(start=0, end=3, closed=True), + cudf.interval_range(start=0, end=3), [[1, 2], [3, 4], [5, 6]], [{"a": 1}, {"a": 2}, {"a": 3}], ], From 1b03911bc5b89755146293b37d6c5efb352af882 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 19 Jan 2024 14:23:11 -0800 Subject: [PATCH 6/8] Simplify some IntervalColumn logic --- python/cudf/cudf/core/column/interval.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/core/column/interval.py b/python/cudf/cudf/core/column/interval.py index b125bbc8a38..ae0b88f9f20 100644 --- a/python/cudf/cudf/core/column/interval.py +++ b/python/cudf/cudf/core/column/interval.py @@ -62,7 +62,7 @@ def to_arrow(self): @classmethod def from_struct_column(cls, struct_column: StructColumn, closed="right"): - first_field_name = list(struct_column.dtype.fields.keys())[0] + first_field_name = next(iter(struct_column.dtype.fields.keys())) return IntervalColumn( size=struct_column.size, dtype=IntervalDtype( @@ -78,9 +78,7 @@ def copy(self, deep=True): struct_copy = super().copy(deep=deep) return IntervalColumn( size=struct_copy.size, - dtype=IntervalDtype( - struct_copy.dtype.fields["left"], self.dtype.closed - ), + dtype=IntervalDtype(struct_copy.dtype.subtype, self.dtype.closed), mask=struct_copy.base_mask, offset=struct_copy.offset, null_count=struct_copy.null_count, @@ -97,7 +95,7 @@ def as_interval_column(self, dtype): # when creating an interval series or interval dataframe if dtype == "interval": dtype = IntervalDtype( - self.dtype.fields["left"], self.dtype.closed + self.dtype.subtype, self.dtype.closed ) children = self.children return IntervalColumn( From fc5ad9fc5105a3b3963316d4ec8edf39e96f66f7 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 19 Jan 2024 15:08:55 -0800 Subject: [PATCH 7/8] Revert field change --- python/cudf/cudf/core/column/interval.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/column/interval.py b/python/cudf/cudf/core/column/interval.py index ae0b88f9f20..7227ef8ba3a 100644 --- a/python/cudf/cudf/core/column/interval.py +++ b/python/cudf/cudf/core/column/interval.py @@ -78,7 +78,9 @@ def copy(self, deep=True): struct_copy = super().copy(deep=deep) return IntervalColumn( size=struct_copy.size, - dtype=IntervalDtype(struct_copy.dtype.subtype, self.dtype.closed), + dtype=IntervalDtype( + struct_copy.dtype.fields["left"], self.dtype.closed + ), mask=struct_copy.base_mask, offset=struct_copy.offset, null_count=struct_copy.null_count, From 2fbcd3d3757787c25a98c1ba5368a593718fd7dc Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 19 Jan 2024 16:09:53 -0800 Subject: [PATCH 8/8] Children columns should have 0 offset --- python/cudf/cudf/core/index.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index c19c5c04106..c10124f4de6 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -3313,11 +3313,21 @@ def from_breaks( >>> cudf.IntervalIndex.from_breaks([0, 1, 2, 3]) IntervalIndex([(0, 1], (1, 2], (2, 3]], dtype='interval[int64, right]') """ - breaks = column.as_column(breaks, dtype=dtype) + breaks = as_column(breaks, dtype=dtype) if copy: breaks = breaks.copy() - left_col = breaks.slice(0, -1) + left_col = breaks.slice(0, len(breaks) - 1) right_col = breaks.slice(1, len(breaks)) + # For indexing, children should both have 0 offset + right_col = column.build_column( + data=right_col.data, + dtype=right_col.dtype, + size=right_col.size, + mask=right_col.mask, + offset=0, + null_count=right_col.null_count, + children=right_col.children, + ) interval_col = IntervalColumn( dtype=IntervalDtype(left_col.dtype, closed),