From 04330f2e9e73ac71a86666c55d0fe7248eaf8db6 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 15 Jul 2024 10:23:07 -1000 Subject: [PATCH 01/11] Fix convert_dtypes with convert_integer=False/convert_floating=True (#15964) If `convert_integer=False`, there should be no attempt to convert to integer Authors: - Matthew Roeschke (https://github.com/mroeschke) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/15964 --- python/cudf/cudf/core/indexed_frame.py | 34 +++++++++++-------- .../cudf/cudf/tests/series/test_conversion.py | 13 +++++++ 2 files changed, 32 insertions(+), 15 deletions(-) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 63fa96d0db0..30b68574960 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -6235,13 +6235,13 @@ def rank( def convert_dtypes( self, - infer_objects=True, - convert_string=True, - convert_integer=True, - convert_boolean=True, - convert_floating=True, + infer_objects: bool = True, + convert_string: bool = True, + convert_integer: bool = True, + convert_boolean: bool = True, + convert_floating: bool = True, dtype_backend=None, - ): + ) -> Self: """ Convert columns to the best possible nullable dtypes. @@ -6252,17 +6252,21 @@ def convert_dtypes( All other dtypes are always returned as-is as all dtypes in cudf are nullable. 
""" - result = self.copy() - - if convert_floating: - # cast any floating columns to int64 if - # they are all integer data: - for name, col in result._data.items(): + if not (convert_floating and convert_integer): + return self.copy() + else: + cols = [] + for col in self._columns: if col.dtype.kind == "f": col = col.fillna(0) - if cp.allclose(col, col.astype("int64")): - result._data[name] = col.astype("int64") - return result + as_int = col.astype("int64") + if cp.allclose(col, as_int): + cols.append(as_int) + continue + cols.append(col) + return self._from_data_like_self( + self._data._from_columns_like_self(cols, verify=False) + ) @_warn_no_dask_cudf def __dask_tokenize__(self): diff --git a/python/cudf/cudf/tests/series/test_conversion.py b/python/cudf/cudf/tests/series/test_conversion.py index e1dd359e1ba..1d680d7860d 100644 --- a/python/cudf/cudf/tests/series/test_conversion.py +++ b/python/cudf/cudf/tests/series/test_conversion.py @@ -31,5 +31,18 @@ def test_convert_dtypes(data, dtype): assert_eq(expect, got) +def test_convert_integer_false_convert_floating_true(): + data = [1.000000000000000000000000001, 1] + expected = pd.Series(data).convert_dtypes( + convert_integer=False, convert_floating=True + ) + result = ( + cudf.Series(data) + .convert_dtypes(convert_integer=False, convert_floating=True) + .to_pandas(nullable=True) + ) + assert_eq(result, expected) + + # Now write the same test, but construct a DataFrame # as input instead of parametrizing: From dba46e7a8957b8389b69e820485e319a1d314017 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 15 Jul 2024 15:21:50 -1000 Subject: [PATCH 02/11] Replace is_datetime/timedelta_dtype checks with .kind checks (#16262) It appears this was called when we already had a dtype object so can instead just simply check the .kind attribute Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: 
https://github.com/rapidsai/cudf/pull/16262 --- python/cudf/cudf/_fuzz_testing/utils.py | 3 +- python/cudf/cudf/core/column/datetime.py | 7 +-- python/cudf/cudf/core/column/timedelta.py | 4 +- python/cudf/cudf/core/dataframe.py | 7 ++- python/cudf/cudf/core/scalar.py | 8 +--- python/cudf/cudf/core/tools/numeric.py | 9 +--- python/cudf/cudf/tests/test_binops.py | 7 +-- python/cudf/cudf/tests/test_dataframe.py | 4 +- python/cudf/cudf/tests/test_list.py | 7 +-- python/cudf/cudf/tests/test_scalar.py | 11 +---- python/cudf/cudf/utils/dtypes.py | 56 ++++++++--------------- 11 files changed, 37 insertions(+), 86 deletions(-) diff --git a/python/cudf/cudf/_fuzz_testing/utils.py b/python/cudf/cudf/_fuzz_testing/utils.py index e6dfe2eae62..8ce92e1c0f6 100644 --- a/python/cudf/cudf/_fuzz_testing/utils.py +++ b/python/cudf/cudf/_fuzz_testing/utils.py @@ -192,8 +192,7 @@ def convert_nulls_to_none(records, df): col for col in df.columns if df[col].dtype in pandas_dtypes_to_np_dtypes - or pd.api.types.is_datetime64_dtype(df[col].dtype) - or pd.api.types.is_timedelta64_dtype(df[col].dtype) + or df[col].dtype.kind in "mM" ] for record in records: diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 214e84028d2..409c44f6eee 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -18,7 +18,6 @@ from cudf import _lib as libcudf from cudf._lib.labeling import label_bins from cudf._lib.search import search_sorted -from cudf.api.types import is_datetime64_dtype, is_timedelta64_dtype from cudf.core._compat import PANDAS_GE_220 from cudf.core._internals.timezones import ( check_ambiguous_and_nonexistent, @@ -565,10 +564,8 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: # We check this on `other` before reflection since we already know the # dtype of `self`. 
- other_is_timedelta = is_timedelta64_dtype(other.dtype) - other_is_datetime64 = not other_is_timedelta and is_datetime64_dtype( - other.dtype - ) + other_is_timedelta = other.dtype.kind == "m" + other_is_datetime64 = other.dtype.kind == "M" lhs, rhs = (other, self) if reflect else (self, other) out_dtype = None diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 2cbed9212de..36d7d9f9614 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -12,7 +12,7 @@ import cudf from cudf import _lib as libcudf -from cudf.api.types import is_scalar, is_timedelta64_dtype +from cudf.api.types import is_scalar from cudf.core.buffer import Buffer, acquire_spill_lock from cudf.core.column import ColumnBase, column, string from cudf.utils.dtypes import np_to_pa_dtype @@ -153,7 +153,7 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: this: ColumnBinaryOperand = self out_dtype = None - if is_timedelta64_dtype(other.dtype): + if other.dtype.kind == "m": # TODO: pandas will allow these operators to work but return false # when comparing to non-timedelta dtypes. We should do the same. 
if op in { diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index f110b788789..2aa1b95e2d1 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -33,7 +33,6 @@ from cudf.api.types import ( _is_scalar_or_zero_d_array, is_bool_dtype, - is_datetime_dtype, is_dict_like, is_dtype_equal, is_list_like, @@ -6113,7 +6112,7 @@ def _prepare_for_rowwise_op(self, method, skipna, numeric_only): else: filtered = self.copy(deep=False) - is_pure_dt = all(is_datetime_dtype(dt) for dt in filtered.dtypes) + is_pure_dt = all(dt.kind == "M" for dt in filtered.dtypes) common_dtype = find_common_type(filtered.dtypes) if ( @@ -6510,7 +6509,7 @@ def _apply_cupy_method_axis_1(self, method, *args, **kwargs): cudf.utils.dtypes.get_min_float_dtype( prepared._data[col] ) - if not is_datetime_dtype(common_dtype) + if common_dtype.kind != "M" else cudf.dtype("float64") ) .fillna(np.nan) @@ -6537,7 +6536,7 @@ def _apply_cupy_method_axis_1(self, method, *args, **kwargs): result_dtype = ( common_dtype if method in type_coerced_methods - or is_datetime_dtype(common_dtype) + or (common_dtype is not None and common_dtype.kind == "M") else None ) result = column.as_column(result, dtype=result_dtype) diff --git a/python/cudf/cudf/core/scalar.py b/python/cudf/cudf/core/scalar.py index 29460d8c67e..f6331aa1f49 100644 --- a/python/cudf/cudf/core/scalar.py +++ b/python/cudf/cudf/core/scalar.py @@ -8,7 +8,7 @@ import pyarrow as pa import cudf -from cudf.api.types import is_datetime64_dtype, is_scalar, is_timedelta64_dtype +from cudf.api.types import is_scalar from cudf.core.dtypes import ListDtype, StructDtype from cudf.core.missing import NA, NaT from cudf.core.mixins import BinaryOperand @@ -245,11 +245,7 @@ def _preprocess_host_value(self, value, dtype): dtype = cudf.dtype(dtype) if not valid: - value = ( - NaT - if is_datetime64_dtype(dtype) or is_timedelta64_dtype(dtype) - else NA - ) + value = NaT if dtype.kind in "mM" else NA 
return value, dtype diff --git a/python/cudf/cudf/core/tools/numeric.py b/python/cudf/cudf/core/tools/numeric.py index ef6b86a04a7..466d46f7dca 100644 --- a/python/cudf/cudf/core/tools/numeric.py +++ b/python/cudf/cudf/core/tools/numeric.py @@ -8,12 +8,7 @@ import cudf from cudf import _lib as libcudf from cudf._lib import strings as libstrings -from cudf.api.types import ( - _is_non_decimal_numeric_dtype, - is_datetime_dtype, - is_string_dtype, - is_timedelta_dtype, -) +from cudf.api.types import _is_non_decimal_numeric_dtype, is_string_dtype from cudf.core.column import as_column from cudf.core.dtypes import CategoricalDtype from cudf.utils.dtypes import can_convert_to_column @@ -114,7 +109,7 @@ def to_numeric(arg, errors="raise", downcast=None): col = as_column(arg) dtype = col.dtype - if is_datetime_dtype(dtype) or is_timedelta_dtype(dtype): + if dtype.kind in "mM": col = col.astype(cudf.dtype("int64")) elif isinstance(dtype, CategoricalDtype): cat_dtype = col.dtype.type diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py index 5265278db4c..503b1a975b4 100644 --- a/python/cudf/cudf/tests/test_binops.py +++ b/python/cudf/cudf/tests/test_binops.py @@ -1694,12 +1694,7 @@ def test_scalar_null_binops(op, dtype_l, dtype_r): rhs = cudf.Scalar(cudf.NA, dtype=dtype_r) result = op(lhs, rhs) - assert result.value is ( - cudf.NaT - if cudf.api.types.is_datetime64_dtype(result.dtype) - or cudf.api.types.is_timedelta64_dtype(result.dtype) - else cudf.NA - ) + assert result.value is (cudf.NaT if result.dtype.kind in "mM" else cudf.NA) # make sure dtype is the same as had there been a valid scalar valid_lhs = cudf.Scalar(1, dtype=dtype_l) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index f40106a30f4..7ccf83e424c 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -5457,9 +5457,7 @@ def test_rowwise_ops_datetime_dtypes(data, op, skipna, 
numeric_only): gdf = cudf.DataFrame(data) pdf = gdf.to_pandas() - if not numeric_only and not all( - cudf.api.types.is_datetime64_dtype(dt) for dt in gdf.dtypes - ): + if not numeric_only and not all(dt.kind == "M" for dt in gdf.dtypes): with pytest.raises(TypeError): got = getattr(gdf, op)( axis=1, skipna=skipna, numeric_only=numeric_only diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py index ec9d7995b05..36bcaa66d7d 100644 --- a/python/cudf/cudf/tests/test_list.py +++ b/python/cudf/cudf/tests/test_list.py @@ -694,12 +694,7 @@ def test_list_scalar_host_construction_null(elem_type, nesting_level): dtype = cudf.ListDtype(dtype) slr = cudf.Scalar(None, dtype=dtype) - assert slr.value is ( - cudf.NaT - if cudf.api.types.is_datetime64_dtype(slr.dtype) - or cudf.api.types.is_timedelta64_dtype(slr.dtype) - else cudf.NA - ) + assert slr.value is (cudf.NaT if slr.dtype.kind in "mM" else cudf.NA) @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_scalar.py b/python/cudf/cudf/tests/test_scalar.py index 195231e9960..f2faf4343b6 100644 --- a/python/cudf/cudf/tests/test_scalar.py +++ b/python/cudf/cudf/tests/test_scalar.py @@ -212,9 +212,7 @@ def test_scalar_roundtrip(value): ) def test_null_scalar(dtype): s = cudf.Scalar(None, dtype=dtype) - if cudf.api.types.is_datetime64_dtype( - dtype - ) or cudf.api.types.is_timedelta64_dtype(dtype): + if s.dtype.kind in "mM": assert s.value is cudf.NaT else: assert s.value is cudf.NA @@ -369,12 +367,7 @@ def test_scalar_implicit_int_conversion(value): @pytest.mark.parametrize("dtype", sorted(set(ALL_TYPES) - {"category"})) def test_scalar_invalid_implicit_conversion(cls, dtype): try: - cls( - pd.NaT - if cudf.api.types.is_datetime64_dtype(dtype) - or cudf.api.types.is_timedelta64_dtype(dtype) - else pd.NA - ) + cls(pd.NaT if cudf.dtype(dtype).kind in "mM" else pd.NA) except TypeError as e: with pytest.raises(TypeError, match=re.escape(str(e))): slr = cudf.Scalar(None, dtype=dtype) diff 
--git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 0dec857ea96..59e5ec1df04 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -424,9 +424,7 @@ def get_time_unit(obj): def _get_nan_for_dtype(dtype): dtype = cudf.dtype(dtype) - if pd.api.types.is_datetime64_dtype( - dtype - ) or pd.api.types.is_timedelta64_dtype(dtype): + if dtype.kind in "mM": time_unit, _ = np.datetime_data(dtype) return dtype.type("nat", time_unit) elif dtype.kind == "f": @@ -527,16 +525,14 @@ def find_common_type(dtypes): return cudf.dtype("O") # Aggregate same types - dtypes = set(dtypes) + dtypes = {cudf.dtype(dtype) for dtype in dtypes} + if len(dtypes) == 1: + return dtypes.pop() if any( isinstance(dtype, cudf.core.dtypes.DecimalDtype) for dtype in dtypes ): - if all( - cudf.api.types.is_decimal_dtype(dtype) - or cudf.api.types.is_numeric_dtype(dtype) - for dtype in dtypes - ): + if all(cudf.api.types.is_numeric_dtype(dtype) for dtype in dtypes): return _find_common_type_decimal( [ dtype @@ -546,40 +542,28 @@ def find_common_type(dtypes): ) else: return cudf.dtype("O") - if any(isinstance(dtype, cudf.ListDtype) for dtype in dtypes): - if len(dtypes) == 1: - return dtypes.get(0) - else: - # TODO: As list dtypes allow casting - # to identical types, improve this logic of returning a - # common dtype, for example: - # ListDtype(int64) & ListDtype(int32) common - # dtype could be ListDtype(int64). 
- raise NotImplementedError( - "Finding a common type for `ListDtype` is currently " - "not supported" - ) - if any(isinstance(dtype, cudf.StructDtype) for dtype in dtypes): - if len(dtypes) == 1: - return dtypes.get(0) - else: - raise NotImplementedError( - "Finding a common type for `StructDtype` is currently " - "not supported" - ) + elif any( + isinstance(dtype, (cudf.ListDtype, cudf.StructDtype)) + for dtype in dtypes + ): + # TODO: As list dtypes allow casting + # to identical types, improve this logic of returning a + # common dtype, for example: + # ListDtype(int64) & ListDtype(int32) common + # dtype could be ListDtype(int64). + raise NotImplementedError( + "Finding a common type for `ListDtype` or `StructDtype` is currently " + "not supported" + ) # Corner case 1: # Resort to np.result_type to handle "M" and "m" types separately - dt_dtypes = set( - filter(lambda t: cudf.api.types.is_datetime_dtype(t), dtypes) - ) + dt_dtypes = set(filter(lambda t: t.kind == "M", dtypes)) if len(dt_dtypes) > 0: dtypes = dtypes - dt_dtypes dtypes.add(np.result_type(*dt_dtypes)) - td_dtypes = set( - filter(lambda t: pd.api.types.is_timedelta64_dtype(t), dtypes) - ) + td_dtypes = set(filter(lambda t: t.kind == "m", dtypes)) if len(td_dtypes) > 0: dtypes = dtypes - td_dtypes dtypes.add(np.result_type(*td_dtypes)) From 47a0a87db454cc767ab5f74beb2198a480d6f2c0 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 15 Jul 2024 16:13:29 -1000 Subject: [PATCH 03/11] Type & reduce cupy usage (#16277) There are some cupy usages that don't seem _strictly_ necessary (generating starting data, array type conversion) in some APIs. 
IMO we should prefer using CPU data/the existing data structure/Column ops over cupy when possible closes https://github.com/rapidsai/cudf/issues/12133 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/16277 --- python/cudf/cudf/core/_base_index.py | 4 ++- python/cudf/cudf/core/column/column.py | 8 +++--- python/cudf/cudf/core/column/datetime.py | 6 ++-- python/cudf/cudf/core/column/numerical.py | 10 ++----- python/cudf/cudf/core/cut.py | 6 ++-- python/cudf/cudf/core/dataframe.py | 18 +++++++----- python/cudf/cudf/core/frame.py | 6 ++-- python/cudf/cudf/core/groupby/groupby.py | 23 ++++++++------- python/cudf/cudf/core/index.py | 34 ++++++++++++----------- python/cudf/cudf/core/multiindex.py | 13 +++++---- python/cudf/cudf/core/tools/datetimes.py | 9 +++--- python/cudf/cudf/tests/test_datetime.py | 15 ++-------- 12 files changed, 74 insertions(+), 78 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index e160fa697ee..9ba2d161619 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -38,6 +38,8 @@ if TYPE_CHECKING: from collections.abc import Generator + import cupy + from cudf.core.column_accessor import ColumnAccessor @@ -2001,7 +2003,7 @@ def drop_duplicates( self._column_names, ) - def duplicated(self, keep="first"): + def duplicated(self, keep="first") -> cupy.ndarray: """ Indicate duplicate index values. 
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index f633d527681..fd3664ecac4 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -721,7 +721,7 @@ def notnull(self) -> ColumnBase: return result def indices_of( - self, value: ScalarLike | Self + self, value: ScalarLike ) -> cudf.core.column.NumericalColumn: """ Find locations of value in the column @@ -735,10 +735,10 @@ def indices_of( ------- Column of indices that match value """ - if not isinstance(value, ColumnBase): - value = as_column([value], dtype=self.dtype) + if not is_scalar(value): + raise ValueError("value must be a scalar") else: - assert len(value) == 1 + value = as_column(value, dtype=self.dtype, length=1) mask = libcudf.search.contains(value, self) return apply_boolean_mask( [as_column(range(0, len(self)), dtype=size_type_dtype)], mask diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 409c44f6eee..004a059af95 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -629,9 +629,9 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: def indices_of( self, value: ScalarLike ) -> cudf.core.column.NumericalColumn: - value = column.as_column( - pd.to_datetime(value), dtype=self.dtype - ).astype("int64") + value = ( + pd.to_datetime(value).to_numpy().astype(self.dtype).astype("int64") + ) return self.astype("int64").indices_of(value) @property diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index b8fa00e9643..7f05a5f91a1 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -5,7 +5,6 @@ import functools from typing import TYPE_CHECKING, Any, Callable, Sequence, cast -import cupy as cp import numpy as np import pandas as pd from typing_extensions import Self @@ -13,7 +12,6 @@ import cudf from 
cudf import _lib as libcudf from cudf._lib import pylibcudf -from cudf._lib.types import size_type_dtype from cudf.api.types import ( is_bool_dtype, is_float_dtype, @@ -131,12 +129,8 @@ def indices_of(self, value: ScalarLike) -> NumericalColumn: and self.dtype.kind in {"c", "f"} and np.isnan(value) ): - return column.as_column( - cp.argwhere( - cp.isnan(self.data_array_view(mode="read")) - ).flatten(), - dtype=size_type_dtype, - ) + nan_col = libcudf.unary.is_nan(self) + return nan_col.indices_of(True) else: return super().indices_of(value) diff --git a/python/cudf/cudf/core/cut.py b/python/cudf/cudf/core/cut.py index d9f62f51f92..197f46ee9fe 100644 --- a/python/cudf/cudf/core/cut.py +++ b/python/cudf/cudf/core/cut.py @@ -188,9 +188,6 @@ def cut( # adjust bin edges decimal precision int_label_bins = np.around(bins, precision) - # the inputs is a column of the values in the array x - input_arr = as_column(x) - # checking for the correct inclusivity values if right: closed = "right" @@ -242,6 +239,9 @@ def cut( labels if len(set(labels)) == len(labels) else None ) + # the inputs is a column of the values in the array x + input_arr = as_column(x) + if isinstance(bins, pd.IntervalIndex): # get the left and right edges of the bins as columns # we cannot typecast an IntervalIndex, so we need to diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 2aa1b95e2d1..2121e623c1c 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -429,7 +429,7 @@ def _setitem_tuple_arg(self, key, value): else: value = cupy.asarray(value) - if cupy.ndim(value) == 2: + if value.ndim == 2: # If the inner dimension is 1, it's broadcastable to # all columns of the dataframe. 
indexed_shape = columns_df.loc[key[0]].shape @@ -566,7 +566,7 @@ def _setitem_tuple_arg(self, key, value): # TODO: consolidate code path with identical counterpart # in `_DataFrameLocIndexer._setitem_tuple_arg` value = cupy.asarray(value) - if cupy.ndim(value) == 2: + if value.ndim == 2: indexed_shape = columns_df.iloc[key[0]].shape if value.shape[1] == 1: if value.shape[0] != indexed_shape[0]: @@ -2199,8 +2199,8 @@ def from_dict( orient = orient.lower() if orient == "index": - if len(data) > 0 and isinstance( - next(iter(data.values())), (cudf.Series, cupy.ndarray) + if isinstance( + next(iter(data.values()), None), (cudf.Series, cupy.ndarray) ): result = cls(data).T result.columns = ( @@ -5698,7 +5698,13 @@ def from_records(cls, data, index=None, columns=None, nan_as_null=False): @classmethod @_performance_tracking - def _from_arrays(cls, data, index=None, columns=None, nan_as_null=False): + def _from_arrays( + cls, + data: np.ndarray | cupy.ndarray, + index=None, + columns=None, + nan_as_null=False, + ): """Convert a numpy/cupy array to DataFrame. 
Parameters @@ -5716,8 +5722,6 @@ def _from_arrays(cls, data, index=None, columns=None, nan_as_null=False): ------- DataFrame """ - - data = cupy.asarray(data) if data.ndim != 1 and data.ndim != 2: raise ValueError( f"records dimension expected 1 or 2 but found: {data.ndim}" diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 253d200f7d4..802751e47ad 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1189,7 +1189,7 @@ def searchsorted( side: Literal["left", "right"] = "left", ascending: bool = True, na_position: Literal["first", "last"] = "last", - ): + ) -> ScalarLike | cupy.ndarray: """Find indices where elements should be inserted to maintain order Parameters @@ -1527,7 +1527,7 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): @acquire_spill_lock() def _apply_cupy_ufunc_to_operands( self, ufunc, cupy_func, operands, **kwargs - ): + ) -> list[dict[Any, ColumnBase]]: # Note: There are some operations that may be supported by libcudf but # are not supported by pandas APIs. In particular, libcudf binary # operations support logical and/or operations as well as @@ -1538,7 +1538,7 @@ def _apply_cupy_ufunc_to_operands( # without cupy. 
mask = None - data = [{} for _ in range(ufunc.nout)] + data: list[dict[Any, ColumnBase]] = [{} for _ in range(ufunc.nout)] for name, (left, right, _, _) in operands.items(): cupy_inputs = [] for inp in (left, right) if ufunc.nin == 2 else (left,): diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index eccb3acabf6..8659d7c2392 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -35,7 +35,12 @@ from cudf.utils.utils import GetAttrGetItemMixin if TYPE_CHECKING: - from cudf._typing import AggType, DataFrameOrSeries, MultiColumnAggType + from cudf._typing import ( + AggType, + DataFrameOrSeries, + MultiColumnAggType, + ScalarLike, + ) def _deprecate_collect(): @@ -357,7 +362,7 @@ def groups(self): ) @cached_property - def indices(self): + def indices(self) -> dict[ScalarLike, cp.ndarray]: """ Dict {group name -> group indices}. @@ -1015,18 +1020,16 @@ def ngroup(self, ascending=True): if ascending: # Count ascending from 0 to num_groups - 1 - group_ids = cudf.Series._from_data({None: cp.arange(num_groups)}) + groups = range(num_groups) elif has_null_group: # Count descending from num_groups - 1 to 0, but subtract one more # for the null group making it num_groups - 2 to -1. 
- group_ids = cudf.Series._from_data( - {None: cp.arange(num_groups - 2, -2, -1)} - ) + groups = range(num_groups - 2, -2, -1) else: # Count descending from num_groups - 1 to 0 - group_ids = cudf.Series._from_data( - {None: cp.arange(num_groups - 1, -1, -1)} - ) + groups = range(num_groups - 1, -1, -1) + + group_ids = cudf.Series._from_data({None: as_column(groups)}) if has_null_group: group_ids.iloc[-1] = cudf.NA @@ -1713,7 +1716,7 @@ def rolling_avg(val, avg): return grouped_values.apply_chunks(function, **kwargs) @_performance_tracking - def _broadcast(self, values): + def _broadcast(self, values: cudf.Series) -> cudf.Series: """ Broadcast the results of an aggregation to the group diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index b398ee2343e..4164f981fca 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -103,7 +103,7 @@ def __subclasscheck__(self, subclass): def _lexsorted_equal_range( idx: Index | cudf.MultiIndex, - key_as_table: Frame, + keys: list[ColumnBase], is_sorted: bool, ) -> tuple[int, int, ColumnBase | None]: """Get equal range for key in lexicographically sorted index. 
If index @@ -118,13 +118,13 @@ def _lexsorted_equal_range( sort_vals = idx lower_bound = search_sorted( [*sort_vals._data.columns], - [*key_as_table._columns], + keys, side="left", ascending=sort_vals.is_monotonic_increasing, ).element_indexing(0) upper_bound = search_sorted( [*sort_vals._data.columns], - [*key_as_table._columns], + keys, side="right", ascending=sort_vals.is_monotonic_increasing, ).element_indexing(0) @@ -260,7 +260,9 @@ def searchsorted( ), "Invalid ascending flag" return search_range(value, self._range, side=side) - def factorize(self, sort: bool = False, use_na_sentinel: bool = True): + def factorize( + self, sort: bool = False, use_na_sentinel: bool = True + ) -> tuple[cupy.ndarray, Self]: if sort and self.step < 0: codes = cupy.arange(len(self) - 1, -1, -1) uniques = self[::-1] @@ -753,15 +755,16 @@ def difference(self, other, sort=None): super().difference(other, sort=sort) ) - def _try_reconstruct_range_index(self, index): - if isinstance(index, RangeIndex) or index.dtype.kind == "f": + def _try_reconstruct_range_index( + self, index: BaseIndex + ) -> Self | BaseIndex: + if isinstance(index, RangeIndex) or index.dtype.kind not in "iu": return index # Evenly spaced values can return a # RangeIndex instead of a materialized Index. 
- if not index._column.has_nulls(): + if not index._column.has_nulls(): # type: ignore[attr-defined] uniques = cupy.unique(cupy.diff(index.values)) - if len(uniques) == 1 and uniques[0].get() != 0: - diff = uniques[0].get() + if len(uniques) == 1 and (diff := uniques[0].get()) != 0: new_range = range(index[0], index[-1] + diff, diff) return type(self)(new_range, name=index.name) return index @@ -1309,7 +1312,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): return _return_get_indexer_result(result_series.to_cupy()) @_performance_tracking - def get_loc(self, key): + def get_loc(self, key) -> int | slice | cupy.ndarray: if not is_scalar(key): raise TypeError("Should be a scalar-like") @@ -1317,9 +1320,8 @@ def get_loc(self, key): self.is_monotonic_increasing or self.is_monotonic_decreasing ) - target_as_table = cudf.core.frame.Frame({"None": as_column([key])}) lower_bound, upper_bound, sort_inds = _lexsorted_equal_range( - self, target_as_table, is_sorted + self, [as_column([key])], is_sorted ) if lower_bound == upper_bound: @@ -1330,7 +1332,7 @@ def get_loc(self, key): return ( lower_bound if is_sorted - else sort_inds.element_indexing(lower_bound) + else sort_inds.element_indexing(lower_bound) # type: ignore[union-attr] ) if is_sorted: @@ -1339,8 +1341,8 @@ def get_loc(self, key): return slice(lower_bound, upper_bound) # Not sorted and not unique. Return a boolean mask - mask = cupy.full(self._data.nrows, False) - true_inds = sort_inds.slice(lower_bound, upper_bound).values + mask = cupy.full(len(self), False) + true_inds = sort_inds.slice(lower_bound, upper_bound).values # type: ignore[union-attr] mask[true_inds] = True return mask @@ -2076,7 +2078,7 @@ def day_of_year(self): @property # type: ignore @_performance_tracking - def is_leap_year(self): + def is_leap_year(self) -> cupy.ndarray: """ Boolean indicator if the date belongs to a leap year. 
diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 6503dae6ff5..3ed72ff812a 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -1926,17 +1926,18 @@ def get_loc(self, key): # Handle partial key search. If length of `key` is less than `nlevels`, # Only search levels up to `len(key)` level. - key_as_table = cudf.core.frame.Frame( - {i: column.as_column(k, length=1) for i, k in enumerate(key)} - ) partial_index = self.__class__._from_data( - data=self._data.select_by_index(slice(key_as_table._num_columns)) + data=self._data.select_by_index(slice(len(key))) ) ( lower_bound, upper_bound, sort_inds, - ) = _lexsorted_equal_range(partial_index, key_as_table, is_sorted) + ) = _lexsorted_equal_range( + partial_index, + [column.as_column(k, length=1) for k in key], + is_sorted, + ) if lower_bound == upper_bound: raise KeyError(key) @@ -1961,7 +1962,7 @@ def get_loc(self, key): return true_inds # Not sorted and not unique. Return a boolean mask - mask = cp.full(self._data.nrows, False) + mask = cp.full(len(self), False) mask[true_inds] = True return mask diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index 064e8fc667d..c6e2b5d10e1 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -6,7 +6,6 @@ import warnings from typing import Literal, Sequence -import cupy as cp import numpy as np import pandas as pd import pandas.tseries.offsets as pd_offset @@ -894,7 +893,7 @@ def date_range( # integers and divide the number range evenly with `periods` elements. 
start = cudf.Scalar(start, dtype=dtype).value.astype("int64") end = cudf.Scalar(end, dtype=dtype).value.astype("int64") - arr = cp.linspace(start=start, stop=end, num=periods) + arr = np.linspace(start=start, stop=end, num=periods) result = cudf.core.column.as_column(arr).astype("datetime64[ns]") return cudf.DatetimeIndex._from_data({name: result}).tz_localize(tz) @@ -991,8 +990,10 @@ def date_range( stop = end_estim.astype("int64") start = start.value.astype("int64") step = _offset_to_nanoseconds_lower_bound(offset) - arr = cp.arange(start=start, stop=stop, step=step, dtype="int64") - res = cudf.core.column.as_column(arr).astype("datetime64[ns]") + arr = range(int(start), int(stop), step) + res = cudf.core.column.as_column(arr, dtype="int64").astype( + "datetime64[ns]" + ) return cudf.DatetimeIndex._from_data({name: res}, freq=freq).tz_localize( tz diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index 092e9790c63..7ab9ff2ef23 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -1534,18 +1534,7 @@ def test_date_range_start_end_periods(start, end, periods): ) -def test_date_range_start_end_freq(request, start, end, freq): - request.applymarker( - pytest.mark.xfail( - condition=( - start == "1831-05-08 15:23:21" - and end == "1996-11-21 04:05:30" - and freq == "110546789ms" - ), - reason="https://github.com/rapidsai/cudf/issues/12133", - ) - ) - +def test_date_range_start_end_freq(start, end, freq): if isinstance(freq, str): _gfreq = _pfreq = freq else: @@ -1561,7 +1550,7 @@ def test_date_range_start_end_freq(request, start, end, freq): ) -def test_date_range_start_freq_periods(request, start, freq, periods): +def test_date_range_start_freq_periods(start, freq, periods): if isinstance(freq, str): _gfreq = _pfreq = freq else: From beda22ed28030bbed2faaa5a49509255f11976aa Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: 
Mon, 15 Jul 2024 16:29:05 -1000 Subject: [PATCH 04/11] Replace is_bool_type with checking .dtype.kind (#16255) It appears this was called when we already had a dtype object so can instead just simply check the .kind attribute Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16255 --- python/cudf/cudf/core/_base_index.py | 9 +++------ python/cudf/cudf/core/_internals/where.py | 8 ++------ python/cudf/cudf/core/column/column.py | 7 +++---- python/cudf/cudf/core/column/numerical.py | 5 ++--- python/cudf/cudf/core/dataframe.py | 13 ++++++------- python/cudf/cudf/core/groupby/groupby.py | 4 ++-- python/cudf/cudf/core/indexing_utils.py | 3 +-- python/cudf/cudf/core/multiindex.py | 4 ---- python/cudf/cudf/core/series.py | 11 +++++------ python/cudf/cudf/core/single_column_frame.py | 3 +-- python/cudf/cudf/tests/test_dataframe.py | 2 +- python/cudf/cudf/tests/test_index.py | 5 ++--- 12 files changed, 28 insertions(+), 46 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 9ba2d161619..479f87bb78b 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -20,7 +20,6 @@ from cudf._lib.types import size_type_dtype from cudf.api.extensions import no_default from cudf.api.types import ( - is_bool_dtype, is_integer, is_integer_dtype, is_list_like, @@ -610,10 +609,8 @@ def union(self, other, sort=None): ) if cudf.get_option("mode.pandas_compatible"): - if ( - is_bool_dtype(self.dtype) and not is_bool_dtype(other.dtype) - ) or ( - not is_bool_dtype(self.dtype) and is_bool_dtype(other.dtype) + if (self.dtype.kind == "b" and other.dtype.kind != "b") or ( + self.dtype.kind != "b" and other.dtype.kind == "b" ): # Bools + other types will result in mixed type. # This is not yet consistent in pandas and specific to APIs. 
@@ -2154,7 +2151,7 @@ def _apply_boolean_mask(self, boolean_mask): Rows corresponding to `False` is dropped. """ boolean_mask = cudf.core.column.as_column(boolean_mask) - if not is_bool_dtype(boolean_mask.dtype): + if boolean_mask.dtype.kind != "b": raise ValueError("boolean_mask is not boolean type.") return self._from_columns_like_self( diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py index f3183e6029d..4a36be76b6d 100644 --- a/python/cudf/cudf/core/_internals/where.py +++ b/python/cudf/cudf/core/_internals/where.py @@ -7,11 +7,7 @@ import numpy as np import cudf -from cudf.api.types import ( - _is_non_decimal_numeric_dtype, - is_bool_dtype, - is_scalar, -) +from cudf.api.types import _is_non_decimal_numeric_dtype, is_scalar from cudf.core.dtypes import CategoricalDtype from cudf.utils.dtypes import ( _can_cast, @@ -112,7 +108,7 @@ def _check_and_cast_columns_with_other( other = cudf.Scalar(other) if is_mixed_with_object_dtype(other, source_col) or ( - is_bool_dtype(source_dtype) and not is_bool_dtype(common_dtype) + source_dtype.kind == "b" and common_dtype.kind != "b" ): raise TypeError(mixed_err) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index fd3664ecac4..dbdf501e022 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -41,7 +41,6 @@ _is_non_decimal_numeric_dtype, _is_pandas_nullable_extension_dtype, infer_dtype, - is_bool_dtype, is_dtype_equal, is_scalar, is_string_dtype, @@ -619,7 +618,7 @@ def _scatter_by_column( key: cudf.core.column.NumericalColumn, value: cudf.core.scalar.Scalar | ColumnBase, ) -> Self: - if is_bool_dtype(key.dtype): + if key.dtype.kind == "b": # `key` is boolean mask if len(key) != len(self): raise ValueError( @@ -644,7 +643,7 @@ def _scatter_by_column( self._check_scatter_key_length(num_keys, value) - if is_bool_dtype(key.dtype): + if key.dtype.kind == "b": return 
libcudf.copying.boolean_mask_scatter([value], [self], key)[ 0 ]._with_type_metadata(self.dtype) @@ -1083,7 +1082,7 @@ def as_decimal_column( def apply_boolean_mask(self, mask) -> ColumnBase: mask = as_column(mask) - if not is_bool_dtype(mask.dtype): + if mask.dtype.kind != "b": raise ValueError("boolean_mask is not boolean type.") return apply_boolean_mask([self], mask)[0]._with_type_metadata( diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 7f05a5f91a1..cea68c88c90 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -13,7 +13,6 @@ from cudf import _lib as libcudf from cudf._lib import pylibcudf from cudf.api.types import ( - is_bool_dtype, is_float_dtype, is_integer, is_integer_dtype, @@ -159,7 +158,7 @@ def __setitem__(self, key: Any, value: Any): else as_column(value) ) - if not is_bool_dtype(self.dtype) and is_bool_dtype(device_value.dtype): + if self.dtype.kind != "b" and device_value.dtype.kind == "b": raise TypeError(f"Invalid value {value} for dtype {self.dtype}") else: device_value = device_value.astype(self.dtype) @@ -264,7 +263,7 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: f"{self.dtype.type.__name__} and " f"{other.dtype.type.__name__}" ) - if is_bool_dtype(self.dtype) or is_bool_dtype(other.dtype): + if self.dtype.kind == "b" or other.dtype.kind == "b": out_dtype = "bool" if ( diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 2121e623c1c..b3d938829c9 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -32,7 +32,6 @@ from cudf.api.extensions import no_default from cudf.api.types import ( _is_scalar_or_zero_d_array, - is_bool_dtype, is_dict_like, is_dtype_equal, is_list_like, @@ -171,7 +170,7 @@ def _can_downcast_to_series(self, df, arg): ): return False else: - if is_bool_dtype(as_column(arg[0]).dtype) and not isinstance( + if 
as_column(arg[0]).dtype.kind == "b" and not isinstance( arg[1], slice ): return True @@ -320,7 +319,7 @@ def _getitem_tuple_arg(self, arg): tmp_arg[1], ) - if is_bool_dtype(tmp_arg[0].dtype): + if tmp_arg[0].dtype.kind == "b": df = columns_df._apply_boolean_mask( BooleanMask(tmp_arg[0], len(columns_df)) ) @@ -3678,8 +3677,8 @@ def agg(self, aggs, axis=None): """ dtypes = [self[col].dtype for col in self._column_names] common_dtype = find_common_type(dtypes) - if not is_bool_dtype(common_dtype) and any( - is_bool_dtype(dtype) for dtype in dtypes + if common_dtype.kind != "b" and any( + dtype.kind == "b" for dtype in dtypes ): raise MixedTypeError("Cannot create a column with mixed types") @@ -6305,8 +6304,8 @@ def _reduce( and any( not is_object_dtype(dtype) for dtype in source_dtypes ) - or not is_bool_dtype(common_dtype) - and any(is_bool_dtype(dtype) for dtype in source_dtypes) + or common_dtype.kind != "b" + and any(dtype.kind == "b" for dtype in source_dtypes) ): raise TypeError( "Columns must all have the same dtype to " diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 8659d7c2392..d2c75715be2 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -22,7 +22,7 @@ from cudf._lib.sort import segmented_sort_by_key from cudf._lib.types import size_type_dtype from cudf.api.extensions import no_default -from cudf.api.types import is_bool_dtype, is_list_like, is_numeric_dtype +from cudf.api.types import is_list_like, is_numeric_dtype from cudf.core._compat import PANDAS_LT_300 from cudf.core.abc import Serializable from cudf.core.column.column import ColumnBase, StructDtype, as_column @@ -1534,7 +1534,7 @@ def mult(df): # For `sum` & `product`, boolean types # will need to result in `int64` type. 
for name, col in res._data.items(): - if is_bool_dtype(col.dtype): + if col.dtype.kind == "b": res._data[name] = col.astype("int") return res diff --git a/python/cudf/cudf/core/indexing_utils.py b/python/cudf/cudf/core/indexing_utils.py index a5fed02cbed..9c81b0eb607 100644 --- a/python/cudf/cudf/core/indexing_utils.py +++ b/python/cudf/cudf/core/indexing_utils.py @@ -10,7 +10,6 @@ import cudf from cudf.api.types import ( _is_scalar_or_zero_d_array, - is_bool_dtype, is_integer, is_integer_dtype, ) @@ -230,7 +229,7 @@ def parse_row_iloc_indexer(key: Any, n: int) -> IndexingSpec: key = cudf.core.column.as_column(key) if isinstance(key, cudf.core.column.CategoricalColumn): key = key.astype(key.codes.dtype) - if is_bool_dtype(key.dtype): + if key.dtype.kind == "b": return MaskIndexer(BooleanMask(key, n)) elif len(key) == 0: return EmptyIndexer() diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 3ed72ff812a..ff4b06c6334 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -841,10 +841,6 @@ def _get_row_major( | tuple[Any, ...] 
| list[tuple[Any, ...]], ) -> DataFrameOrSeries: - if pd.api.types.is_bool_dtype( - list(row_tuple) if isinstance(row_tuple, tuple) else row_tuple - ): - return df[row_tuple] if isinstance(row_tuple, slice): if row_tuple.start is None: row_tuple = slice(self[0], row_tuple.stop, row_tuple.step) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 8c8fa75918c..e12cc3d52fb 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -22,7 +22,6 @@ from cudf.api.types import ( _is_non_decimal_numeric_dtype, _is_scalar_or_zero_d_array, - is_bool_dtype, is_dict_like, is_integer, is_integer_dtype, @@ -221,10 +220,10 @@ def __setitem__(self, key, value): f"Cannot assign {value=} to " f"non-float dtype={self._frame.dtype}" ) - elif ( - self._frame.dtype.kind == "b" - and not is_bool_dtype(value) - and value not in {None, cudf.NA} + elif self._frame.dtype.kind == "b" and not ( + value in {None, cudf.NA} + or isinstance(value, (np.bool_, bool)) + or (isinstance(value, cudf.Scalar) and value.dtype.kind == "b") ): raise MixedTypeError( f"Cannot assign {value=} to " @@ -3221,7 +3220,7 @@ def describe( percentiles = np.array([0.25, 0.5, 0.75]) dtype = "str" - if is_bool_dtype(self.dtype): + if self.dtype.kind == "b": data = _describe_categorical(self, percentiles) elif isinstance(self._column, cudf.core.column.NumericalColumn): data = _describe_numeric(self, percentiles) diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index f9555aee6a2..04c7db7a53c 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -11,7 +11,6 @@ from cudf.api.extensions import no_default from cudf.api.types import ( _is_scalar_or_zero_d_array, - is_bool_dtype, is_integer, is_integer_dtype, is_numeric_dtype, @@ -361,7 +360,7 @@ def _get_elements_from_column(self, arg) -> ScalarLike | ColumnBase: arg = cudf.core.column.column_empty(0, 
dtype="int32") if is_integer_dtype(arg.dtype): return self._column.take(arg) - if is_bool_dtype(arg.dtype): + if arg.dtype.kind == "b": if (bn := len(arg)) != (n := len(self)): raise IndexError( f"Boolean mask has wrong length: {bn} not {n}" diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 7ccf83e424c..2009fc49ce5 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -5234,7 +5234,7 @@ def test_rowwise_ops(data, op, skipna, numeric_only): else (pdf[column].notna().count() == 0) ) or cudf.api.types.is_numeric_dtype(pdf[column].dtype) - or cudf.api.types.is_bool_dtype(pdf[column].dtype) + or pdf[column].dtype.kind == "b" for column in pdf ): with pytest.raises(TypeError): diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 05dcd85df6a..9eba6122d26 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -16,7 +16,6 @@ import cudf from cudf.api.extensions import no_default -from cudf.api.types import is_bool_dtype from cudf.core.index import CategoricalIndex, DatetimeIndex, Index, RangeIndex from cudf.testing import assert_eq from cudf.testing._utils import ( @@ -2397,8 +2396,8 @@ def test_intersection_index(idx1, idx2, sort, pandas_compatible): expected, actual, exact=False - if (is_bool_dtype(idx1.dtype) and not is_bool_dtype(idx2.dtype)) - or (not is_bool_dtype(idx1.dtype) or is_bool_dtype(idx2.dtype)) + if (idx1.dtype.kind == "b" and idx2.dtype.kind != "b") + or (idx1.dtype.kind != "b" or idx2.dtype.kind == "b") else True, ) From 669db3ea4a0c24a343c5619dd00904ad22ea215b Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Tue, 16 Jul 2024 14:24:58 +0100 Subject: [PATCH 05/11] Fix logic in to_arrow for empty list column (#16279) An empty list column need not have empty children, it just needs to have zero length. 
In this case, the offsets array will have zero length, and we need to create a temporary buffer. Now that this branch runs, fix two errors in the construction of the arrow array: 1. The element type, if there are children, should be taken from the child array; 2. If the child arrays are empty, we must make an empty null array, rather than passing a null pointer as the values array, otherwise we hit a segfault inside arrow. The previous fix in #16201 correctly handled the empty children case (except for point two), but not the first case, which we do here. Since we weren't previously going down this code path (child_arrays was never empty), we never hit the latent segfault from point two. Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - David Wendt (https://github.com/davidwendt) - MithunR (https://github.com/mythrocks) URL: https://github.com/rapidsai/cudf/pull/16279 --- cpp/src/interop/to_arrow.cu | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/cpp/src/interop/to_arrow.cu b/cpp/src/interop/to_arrow.cu index 8c4be1b50a5..622a3aba4bb 100644 --- a/cpp/src/interop/to_arrow.cu +++ b/cpp/src/interop/to_arrow.cu @@ -378,13 +378,11 @@ std::shared_ptr dispatch_to_arrow::operator()( auto children_meta = metadata.children_meta.empty() ? std::vector{{}, {}} : metadata.children_meta; auto child_arrays = fetch_child_array(input_view, children_meta, ar_mr, stream); - if (child_arrays.empty()) { - // Empty list will have only one value in offset of 4 bytes - auto tmp_offset_buffer = allocate_arrow_buffer(sizeof(int32_t), ar_mr); - memset(tmp_offset_buffer->mutable_data(), 0, sizeof(int32_t)); - - return std::make_shared( - arrow::list(arrow::null()), 0, std::move(tmp_offset_buffer), nullptr); + if (child_arrays.empty() || child_arrays[0]->data()->length == 0) { + auto element_type = child_arrays.empty() ? 
arrow::null() : child_arrays[1]->type(); + auto result = arrow::MakeEmptyArray(arrow::list(element_type), ar_mr); + CUDF_EXPECTS(result.ok(), "Failed to construct empty arrow list array\n"); + return result.ValueUnsafe(); } auto offset_buffer = child_arrays[0]->data()->buffers[1]; From a6de6cc23702ed71b80625f461a90e910a33642f Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Tue, 16 Jul 2024 15:42:12 +0100 Subject: [PATCH 06/11] Introduce version file so we can conditionally handle things in tests (#16280) We decided we would attempt to support a range of versions back to 1.0. We'll test with oldest and newest versions we support. To facilitate, introduce some versioning constants. Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - Thomas Li (https://github.com/lithomas1) URL: https://github.com/rapidsai/cudf/pull/16280 --- python/cudf_polars/cudf_polars/dsl/ir.py | 7 ++++- .../cudf_polars/cudf_polars/utils/versions.py | 28 +++++++++++++++++++ python/cudf_polars/tests/test_scan.py | 5 ++++ 3 files changed, 39 insertions(+), 1 deletion(-) create mode 100644 python/cudf_polars/cudf_polars/utils/versions.py diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 5e6544ef77c..cce0c4a3d94 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -313,7 +313,12 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: raise NotImplementedError( f"Unhandled scan type: {self.typ}" ) # pragma: no cover; post init trips first - if row_index is not None: + if ( + row_index is not None + # TODO: remove condition when dropping support for polars 1.0 + # https://github.com/pola-rs/polars/pull/17363 + and row_index[0] in self.schema + ): name, offset = row_index dtype = self.schema[name] step = plc.interop.from_arrow( diff --git a/python/cudf_polars/cudf_polars/utils/versions.py b/python/cudf_polars/cudf_polars/utils/versions.py new file mode 
100644 index 00000000000..a9ac14c25aa --- /dev/null +++ b/python/cudf_polars/cudf_polars/utils/versions.py @@ -0,0 +1,28 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +"""Version utilities so that cudf_polars supports a range of polars versions.""" + +# ruff: noqa: SIM300 +from __future__ import annotations + +from packaging.version import parse + +from polars import __version__ + +POLARS_VERSION = parse(__version__) + +POLARS_VERSION_GE_10 = POLARS_VERSION >= parse("1.0") +POLARS_VERSION_GE_11 = POLARS_VERSION >= parse("1.1") +POLARS_VERSION_GE_12 = POLARS_VERSION >= parse("1.2") +POLARS_VERSION_GT_10 = POLARS_VERSION > parse("1.0") +POLARS_VERSION_GT_11 = POLARS_VERSION > parse("1.1") +POLARS_VERSION_GT_12 = POLARS_VERSION > parse("1.2") + +POLARS_VERSION_LE_12 = POLARS_VERSION <= parse("1.2") +POLARS_VERSION_LE_11 = POLARS_VERSION <= parse("1.1") +POLARS_VERSION_LT_12 = POLARS_VERSION < parse("1.2") +POLARS_VERSION_LT_11 = POLARS_VERSION < parse("1.1") + +if POLARS_VERSION < parse("1.0"): # pragma: no cover + raise ImportError("cudf_polars requires py-polars v1.0 or greater.") diff --git a/python/cudf_polars/tests/test_scan.py b/python/cudf_polars/tests/test_scan.py index c41a94da14b..d0c41090433 100644 --- a/python/cudf_polars/tests/test_scan.py +++ b/python/cudf_polars/tests/test_scan.py @@ -10,6 +10,7 @@ assert_gpu_result_equal, assert_ir_translation_raises, ) +from cudf_polars.utils import versions @pytest.fixture( @@ -97,6 +98,10 @@ def test_scan_unsupported_raises(tmp_path): assert_ir_translation_raises(q, NotImplementedError) +@pytest.mark.xfail( + versions.POLARS_VERSION_LT_11, + reason="https://github.com/pola-rs/polars/issues/15730", +) def test_scan_row_index_projected_out(tmp_path): df = pl.DataFrame({"a": [1, 2, 3]}) From 3418f915d1a1ff82a72918d978924dfad2645a5a Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 16 Jul 2024 12:38:47 -0500 Subject: [PATCH 07/11] 
Introduce dedicated options for low memory readers (#16289) This PR disables low memory readers by default in `cudf.pandas` and instead gives a provision to enable them with dedicated options. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/16289 --- python/cudf/cudf/_lib/json.pyx | 2 +- python/cudf/cudf/io/parquet.py | 2 +- python/cudf/cudf/options.py | 26 ++++++++++++++++++++++++++ python/cudf/cudf/tests/test_json.py | 2 +- python/cudf/cudf/tests/test_parquet.py | 2 +- 5 files changed, 30 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/_lib/json.pyx b/python/cudf/cudf/_lib/json.pyx index 853dd431099..03bf9ed8b75 100644 --- a/python/cudf/cudf/_lib/json.pyx +++ b/python/cudf/cudf/_lib/json.pyx @@ -99,7 +99,7 @@ cpdef read_json(object filepaths_or_buffers, else: raise TypeError("`dtype` must be 'list like' or 'dict'") - if cudf.get_option("mode.pandas_compatible") and lines: + if cudf.get_option("io.json.low_memory") and lines: res_cols, res_col_names, res_child_names = plc.io.json.chunked_read_json( plc.io.SourceInfo(filepaths_or_buffers), processed_dtypes, diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index fd0792b5edb..02b26ea1c01 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -916,7 +916,7 @@ def _read_parquet( "cudf engine doesn't support the " f"following positional arguments: {list(args)}" ) - if cudf.get_option("mode.pandas_compatible"): + if cudf.get_option("io.parquet.low_memory"): return libparquet.ParquetReader( filepaths_or_buffers, columns=columns, diff --git a/python/cudf/cudf/options.py b/python/cudf/cudf/options.py index 1f539e7f266..94e73021cec 100644 --- a/python/cudf/cudf/options.py +++ b/python/cudf/cudf/options.py @@ -325,6 +325,32 @@ def _integer_and_none_validator(val): _make_contains_validator([False, True]), ) +_register_option( + 
"io.parquet.low_memory", + False, + textwrap.dedent( + """ + If set to `False`, reads entire parquet in one go. + If set to `True`, reads parquet file in chunks. + \tValid values are True or False. Default is False. + """ + ), + _make_contains_validator([False, True]), +) + +_register_option( + "io.json.low_memory", + False, + textwrap.dedent( + """ + If set to `False`, reads entire json in one go. + If set to `True`, reads json file in chunks. + \tValid values are True or False. Default is False. + """ + ), + _make_contains_validator([False, True]), +) + class option_context(ContextDecorator): """ diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py index 7771afd692f..c81c2d1d94b 100644 --- a/python/cudf/cudf/tests/test_json.py +++ b/python/cudf/cudf/tests/test_json.py @@ -1441,6 +1441,6 @@ def test_chunked_json_reader(): df.to_json(buf, lines=True, orient="records", engine="cudf") buf.seek(0) df = df.to_pandas() - with cudf.option_context("mode.pandas_compatible", True): + with cudf.option_context("io.json.low_memory", True): gdf = cudf.read_json(buf, lines=True) assert_eq(df, gdf) diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index ff0c9040737..ecb7fd44422 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -3772,6 +3772,6 @@ def test_parquet_reader_pandas_compatibility(): ) buffer = BytesIO() df.to_parquet(buffer) - with cudf.option_context("mode.pandas_compatible", True): + with cudf.option_context("io.parquet.low_memory", True): expected = cudf.read_parquet(buffer) assert_eq(expected, df) From e2b7e4370c8513811e9c72b30f499a5614b49f7c Mon Sep 17 00:00:00 2001 From: Kyle Edwards Date: Tue, 16 Jul 2024 14:20:00 -0400 Subject: [PATCH 08/11] Build and test with CUDA 12.5.1 (#16259) This PR updates the latest CUDA build/test version 12.2.2 to 12.5.1. 
Contributes to https://github.com/rapidsai/build-planning/issues/73 Authors: - Kyle Edwards (https://github.com/KyleFromNVIDIA) Approvers: - James Lamb (https://github.com/jameslamb) - https://github.com/jakirkham URL: https://github.com/rapidsai/cudf/pull/16259 --- .../cuda12.2-conda/devcontainer.json | 8 ++-- .devcontainer/cuda12.2-pip/devcontainer.json | 10 ++-- .github/workflows/build.yaml | 20 ++++---- .github/workflows/pandas-tests.yaml | 4 +- .github/workflows/pr.yaml | 48 +++++++++---------- .../workflows/pr_issue_status_automation.yml | 6 +-- .github/workflows/test.yaml | 22 ++++----- CONTRIBUTING.md | 2 +- README.md | 2 +- ..._64.yaml => all_cuda-125_arch-x86_64.yaml} | 4 +- dependencies.yaml | 6 ++- 11 files changed, 68 insertions(+), 64 deletions(-) rename conda/environments/{all_cuda-122_arch-x86_64.yaml => all_cuda-125_arch-x86_64.yaml} (97%) diff --git a/.devcontainer/cuda12.2-conda/devcontainer.json b/.devcontainer/cuda12.2-conda/devcontainer.json index 05bf9173d25..fadce01d060 100644 --- a/.devcontainer/cuda12.2-conda/devcontainer.json +++ b/.devcontainer/cuda12.2-conda/devcontainer.json @@ -3,7 +3,7 @@ "context": "${localWorkspaceFolder}/.devcontainer", "dockerfile": "${localWorkspaceFolder}/.devcontainer/Dockerfile", "args": { - "CUDA": "12.2", + "CUDA": "12.5", "PYTHON_PACKAGE_MANAGER": "conda", "BASE": "rapidsai/devcontainers:24.08-cpp-mambaforge-ubuntu22.04" } @@ -11,7 +11,7 @@ "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.08-cuda12.2-conda" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.08-cuda12.5-conda" ], "hostRequirements": {"gpu": "optional"}, "features": { @@ -20,7 +20,7 @@ "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" ], - "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config,conda/pkgs,conda/${localWorkspaceFolderBasename}-cuda12.2-envs}"], + "initializeCommand": 
["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config,conda/pkgs,conda/${localWorkspaceFolderBasename}-cuda12.5-envs}"], "postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . rapids-post-attach-command; fi"], "workspaceFolder": "/home/coder", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/cudf,type=bind,consistency=consistent", @@ -29,7 +29,7 @@ "source=${localWorkspaceFolder}/../.cache,target=/home/coder/.cache,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/../.config,target=/home/coder/.config,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/../.conda/pkgs,target=/home/coder/.conda/pkgs,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/../.conda/${localWorkspaceFolderBasename}-cuda12.2-envs,target=/home/coder/.conda/envs,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/../.conda/${localWorkspaceFolderBasename}-cuda12.5-envs,target=/home/coder/.conda/envs,type=bind,consistency=consistent" ], "customizations": { "vscode": { diff --git a/.devcontainer/cuda12.2-pip/devcontainer.json b/.devcontainer/cuda12.2-pip/devcontainer.json index 74420214726..026eb540952 100644 --- a/.devcontainer/cuda12.2-pip/devcontainer.json +++ b/.devcontainer/cuda12.2-pip/devcontainer.json @@ -3,15 +3,15 @@ "context": "${localWorkspaceFolder}/.devcontainer", "dockerfile": "${localWorkspaceFolder}/.devcontainer/Dockerfile", "args": { - "CUDA": "12.2", + "CUDA": "12.5", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": "rapidsai/devcontainers:24.08-cpp-cuda12.2-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.08-cpp-cuda12.5-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.08-cuda12.2-pip" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.08-cuda12.5-pip" ], "hostRequirements": {"gpu": "optional"}, "features": { @@ 
-20,7 +20,7 @@ "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" ], - "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config/pip,local/share/${localWorkspaceFolderBasename}-cuda12.2-venvs}"], + "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config/pip,local/share/${localWorkspaceFolderBasename}-cuda12.5-venvs}"], "postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . rapids-post-attach-command; fi"], "workspaceFolder": "/home/coder", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/cudf,type=bind,consistency=consistent", @@ -28,7 +28,7 @@ "source=${localWorkspaceFolder}/../.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/../.cache,target=/home/coder/.cache,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/../.config,target=/home/coder/.config,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/../.local/share/${localWorkspaceFolderBasename}-cuda12.2-venvs,target=/home/coder/.local/share/venvs,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/../.local/share/${localWorkspaceFolderBasename}-cuda12.5-venvs,target=/home/coder/.local/share/venvs,type=bind,consistency=consistent" ], "customizations": { "vscode": { diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 2e5959338b0..937080572ad 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -28,7 +28,7 @@ concurrency: jobs: cpp-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@cuda-12.5.1 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -37,7 +37,7 @@ jobs: 
python-build: needs: [cpp-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@cuda-12.5.1 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -46,7 +46,7 @@ jobs: upload-conda: needs: [cpp-build, python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@cuda-12.5.1 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -57,7 +57,7 @@ jobs: if: github.ref_type == 'branch' needs: python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1 with: arch: "amd64" branch: ${{ inputs.branch }} @@ -69,7 +69,7 @@ jobs: sha: ${{ inputs.sha }} wheel-build-cudf: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.5.1 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -79,7 +79,7 @@ jobs: wheel-publish-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@cuda-12.5.1 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -89,7 +89,7 @@ jobs: wheel-build-dask-cudf: needs: wheel-publish-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.5.1 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". 
matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -101,7 +101,7 @@ jobs: wheel-publish-dask-cudf: needs: wheel-build-dask-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@cuda-12.5.1 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -111,7 +111,7 @@ jobs: wheel-build-cudf-polars: needs: wheel-publish-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.5.1 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -123,7 +123,7 @@ jobs: wheel-publish-cudf-polars: needs: wheel-build-cudf-polars secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@cuda-12.5.1 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pandas-tests.yaml b/.github/workflows/pandas-tests.yaml index a8643923a4d..1516cb09449 100644 --- a/.github/workflows/pandas-tests.yaml +++ b/.github/workflows/pandas-tests.yaml @@ -17,9 +17,9 @@ jobs: pandas-tests: # run the Pandas unit tests secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.5.1 with: - matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.9" and .CUDA_VER == "12.2.2" )) + matrix_filter: map(select(.ARCH 
== "amd64" and .PY_VER == "3.9" and (.CUDA_VER | startswith("12.5.")) )) build_type: nightly branch: ${{ inputs.branch }} date: ${{ inputs.date }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index ceee9074b93..1fe64e7f318 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -34,41 +34,41 @@ jobs: - pandas-tests - pandas-tests-diff secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@cuda-12.5.1 checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@cuda-12.5.1 with: enable_check_generated_files: false conda-cpp-build: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@cuda-12.5.1 with: build_type: pull-request conda-cpp-checks: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@cuda-12.5.1 with: build_type: pull-request enable_check_symbols: true conda-cpp-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@cuda-12.5.1 with: build_type: pull-request conda-python-build: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@cuda-12.5.1 with: build_type: pull-request conda-python-cudf-tests: needs: conda-python-build secrets: inherit - uses: 
rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@cuda-12.5.1 with: build_type: pull-request script: "ci/test_python_cudf.sh" @@ -76,14 +76,14 @@ jobs: # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@cuda-12.5.1 with: build_type: pull-request script: "ci/test_python_other.sh" conda-java-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -93,7 +93,7 @@ jobs: static-configure: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1 with: build_type: pull-request # Use the wheel container so we can skip conda solves and since our @@ -103,7 +103,7 @@ jobs: conda-notebook-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -113,7 +113,7 @@ jobs: docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -123,21 +123,21 @@ jobs: wheel-build-cudf: needs: checks secrets: inherit - uses: 
rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.5.1 with: build_type: pull-request script: "ci/build_wheel_cudf.sh" wheel-tests-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.5.1 with: build_type: pull-request script: ci/test_wheel_cudf.sh wheel-build-cudf-polars: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.5.1 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -146,7 +146,7 @@ jobs: wheel-tests-cudf-polars: needs: wheel-build-cudf-polars secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.5.1 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -157,7 +157,7 @@ jobs: wheel-build-dask-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.5.1 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". 
matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -166,7 +166,7 @@ jobs: wheel-tests-dask-cudf: needs: wheel-build-dask-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.5.1 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -174,10 +174,10 @@ jobs: script: ci/test_wheel_dask_cudf.sh devcontainer: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@cuda-12.5.1 with: arch: '["amd64"]' - cuda: '["12.2"]' + cuda: '["12.5"]' build_command: | sccache -z; build-all -DBUILD_BENCHMARKS=ON --verbose; @@ -185,7 +185,7 @@ jobs: unit-tests-cudf-pandas: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.5.1 with: matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) build_type: pull-request @@ -194,9 +194,9 @@ jobs: # run the Pandas unit tests using PR branch needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.5.1 with: - matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.9" and .CUDA_VER == "12.2.2" )) + matrix_filter: map(select(.ARCH == 
"amd64" and .PY_VER == "3.9" and (.CUDA_VER | startswith("12.5.")) )) build_type: pull-request script: ci/cudf_pandas_scripts/pandas-tests/run.sh pr # Hide test failures because they exceed the GITHUB_STEP_SUMMARY output limit. @@ -204,7 +204,7 @@ jobs: pandas-tests-diff: # diff the results of running the Pandas unit tests and publish a job summary needs: pandas-tests - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1 with: node_type: cpu4 build_type: pull-request diff --git a/.github/workflows/pr_issue_status_automation.yml b/.github/workflows/pr_issue_status_automation.yml index 8ca971dc28d..2a8ebd30993 100644 --- a/.github/workflows/pr_issue_status_automation.yml +++ b/.github/workflows/pr_issue_status_automation.yml @@ -23,7 +23,7 @@ on: jobs: get-project-id: - uses: rapidsai/shared-workflows/.github/workflows/project-get-item-id.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/project-get-item-id.yaml@cuda-12.5.1 if: github.event.pull_request.state == 'open' secrets: inherit permissions: @@ -34,7 +34,7 @@ jobs: update-status: # This job sets the PR and its linked issues to "In Progress" status - uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@cuda-12.5.1 if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }} needs: get-project-id with: @@ -50,7 +50,7 @@ jobs: update-sprint: # This job sets the PR and its linked issues to the current "Weekly Sprint" - uses: rapidsai/shared-workflows/.github/workflows/project-get-set-iteration-field.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/project-get-set-iteration-field.yaml@cuda-12.5.1 if: ${{ github.event.pull_request.state == 'open' && 
needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }} needs: get-project-id with: diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 36c9088d93c..73f8d726e77 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,7 +16,7 @@ on: jobs: conda-cpp-checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@cuda-12.5.1 with: build_type: nightly branch: ${{ inputs.branch }} @@ -25,7 +25,7 @@ jobs: enable_check_symbols: true conda-cpp-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@cuda-12.5.1 with: build_type: nightly branch: ${{ inputs.branch }} @@ -33,7 +33,7 @@ jobs: sha: ${{ inputs.sha }} conda-cpp-memcheck-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1 with: build_type: nightly branch: ${{ inputs.branch }} @@ -45,7 +45,7 @@ jobs: run_script: "ci/test_cpp_memcheck.sh" static-configure: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1 with: build_type: pull-request # Use the wheel container so we can skip conda solves and since our @@ -54,7 +54,7 @@ jobs: run_script: "ci/configure_cpp_static.sh" conda-python-cudf-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@cuda-12.5.1 with: build_type: nightly branch: ${{ inputs.branch }} @@ -64,7 +64,7 @@ jobs: conda-python-other-tests: # Tests for dask_cudf, custreamz, cudf_kafka are separated for 
CI parallelism secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@cuda-12.5.1 with: build_type: nightly branch: ${{ inputs.branch }} @@ -73,7 +73,7 @@ jobs: script: "ci/test_python_other.sh" conda-java-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1 with: build_type: nightly branch: ${{ inputs.branch }} @@ -85,7 +85,7 @@ jobs: run_script: "ci/test_java.sh" conda-notebook-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1 with: build_type: nightly branch: ${{ inputs.branch }} @@ -97,7 +97,7 @@ jobs: run_script: "ci/test_notebooks.sh" wheel-tests-cudf: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.5.1 with: build_type: nightly branch: ${{ inputs.branch }} @@ -106,7 +106,7 @@ jobs: script: ci/test_wheel_cudf.sh wheel-tests-dask-cudf: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.5.1 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". 
matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -117,7 +117,7 @@ jobs: script: ci/test_wheel_dask_cudf.sh unit-tests-cudf-pandas: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.5.1 with: build_type: nightly branch: ${{ inputs.branch }} diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 4fbc28fa6e1..f9cdde7c2b7 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -104,7 +104,7 @@ Instructions for a minimal build environment without conda are included below. # create the conda environment (assuming in base `cudf` directory) # note: RAPIDS currently doesn't support `channel_priority: strict`; # use `channel_priority: flexible` instead -conda env create --name cudf_dev --file conda/environments/all_cuda-122_arch-x86_64.yaml +conda env create --name cudf_dev --file conda/environments/all_cuda-125_arch-x86_64.yaml # activate the environment conda activate cudf_dev ``` diff --git a/README.md b/README.md index 17d2df9a936..1ab6a2d7457 100644 --- a/README.md +++ b/README.md @@ -83,7 +83,7 @@ cuDF can be installed with conda (via [miniconda](https://docs.conda.io/projects ```bash conda install -c rapidsai -c conda-forge -c nvidia \ - cudf=24.08 python=3.11 cuda-version=12.2 + cudf=24.08 python=3.11 cuda-version=12.5 ``` We also provide [nightly Conda packages](https://anaconda.org/rapidsai-nightly) built from the HEAD diff --git a/conda/environments/all_cuda-122_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml similarity index 97% rename from conda/environments/all_cuda-122_arch-x86_64.yaml rename to conda/environments/all_cuda-125_arch-x86_64.yaml index c32d21c5d36..3f5fae49cbb 100644 --- a/conda/environments/all_cuda-122_arch-x86_64.yaml +++ 
b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -23,7 +23,7 @@ dependencies: - cuda-nvtx-dev - cuda-python>=12.0,<13.0a0 - cuda-sanitizer-api -- cuda-version=12.2 +- cuda-version=12.5 - cupy>=12.0.0 - cxx-compiler - cython>=3.0.3 @@ -96,4 +96,4 @@ dependencies: - zlib>=1.2.13 - pip: - git+https://github.com/python-streamz/streamz.git@master -name: all_cuda-122_arch-x86_64 +name: all_cuda-125_arch-x86_64 diff --git a/dependencies.yaml b/dependencies.yaml index 27621ff9a3f..67ed3773b44 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -3,7 +3,7 @@ files: all: output: conda matrix: - cuda: ["11.8", "12.2"] + cuda: ["11.8", "12.5"] arch: [x86_64] includes: - build_base @@ -402,6 +402,10 @@ dependencies: cuda: "12.2" packages: - cuda-version=12.2 + - matrix: + cuda: "12.5" + packages: + - cuda-version=12.5 cuda: specific: - output_types: conda From 05ea7c9cf6a0fd39384e2044b4c9b46f543d4ad0 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Tue, 16 Jul 2024 12:51:20 -0700 Subject: [PATCH 09/11] Fix tests for polars 1.2 (#16292) Authors: - Thomas Li (https://github.com/lithomas1) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/16292 --- python/cudf_polars/tests/test_groupby.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/cudf_polars/tests/test_groupby.py b/python/cudf_polars/tests/test_groupby.py index 50adca01950..b07d8e38217 100644 --- a/python/cudf_polars/tests/test_groupby.py +++ b/python/cudf_polars/tests/test_groupby.py @@ -12,6 +12,7 @@ assert_gpu_result_equal, assert_ir_translation_raises, ) +from cudf_polars.utils import versions @pytest.fixture @@ -100,7 +101,7 @@ def test_groupby_sorted_keys(df: pl.LazyFrame, keys, exprs): with pytest.raises(AssertionError): # https://github.com/pola-rs/polars/issues/17556 assert_gpu_result_equal(q, check_exact=False) - if schema[sort_keys[1]] == pl.Boolean(): + if 
versions.POLARS_VERSION_LT_12 and schema[sort_keys[1]] == pl.Boolean(): # https://github.com/pola-rs/polars/issues/17557 with pytest.raises(AssertionError): assert_gpu_result_equal(qsorted, check_exact=False) From 62191103032706371d76ce83c6ec59d13376b231 Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Tue, 16 Jul 2024 16:23:49 -0400 Subject: [PATCH 10/11] [BUG] Make name attr of Index fast slow attrs (#16270) Debugging the spike in failures from #16234 Authors: - Matthew Murray (https://github.com/Matt711) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/16270 --- python/cudf/cudf/pandas/_wrappers/pandas.py | 36 ++++++++------------- 1 file changed, 14 insertions(+), 22 deletions(-) diff --git a/python/cudf/cudf/pandas/_wrappers/pandas.py b/python/cudf/cudf/pandas/_wrappers/pandas.py index d3a3488081a..59a243dd7c4 100644 --- a/python/cudf/cudf/pandas/_wrappers/pandas.py +++ b/python/cudf/cudf/pandas/_wrappers/pandas.py @@ -260,18 +260,14 @@ def Index__new__(cls, *args, **kwargs): return self -def name(self): - return self._fsproxy_wrapped._name - - def Index__setattr__(self, name, value): if name.startswith("_"): object.__setattr__(self, name, value) return if name == "name": - setattr(self._fsproxy_wrapped, "_name", value) + setattr(self._fsproxy_wrapped, "name", value) if name == "names": - setattr(self._fsproxy_wrapped, "_names", value) + setattr(self._fsproxy_wrapped, "names", value) return _FastSlowAttribute("__setattr__").__get__(self, type(self))( name, value ) @@ -300,7 +296,7 @@ def Index__setattr__(self, name, value): "_accessors": set(), "_data": _FastSlowAttribute("_data", private=True), "_mask": _FastSlowAttribute("_mask", private=True), - "name": property(name), + "name": _FastSlowAttribute("name"), }, ) @@ -314,7 +310,7 @@ def Index__setattr__(self, name, value): additional_attributes={ 
"__init__": _DELETE, "__setattr__": Index__setattr__, - "name": property(name), + "name": _FastSlowAttribute("name"), }, ) @@ -345,7 +341,7 @@ def Index__setattr__(self, name, value): additional_attributes={ "__init__": _DELETE, "__setattr__": Index__setattr__, - "name": property(name), + "name": _FastSlowAttribute("name"), }, ) @@ -375,10 +371,10 @@ def Index__setattr__(self, name, value): bases=(Index,), additional_attributes={ "__init__": _DELETE, + "__setattr__": Index__setattr__, "_data": _FastSlowAttribute("_data", private=True), "_mask": _FastSlowAttribute("_mask", private=True), - "__setattr__": Index__setattr__, - "name": property(name), + "name": _FastSlowAttribute("name"), }, ) @@ -412,10 +408,10 @@ def Index__setattr__(self, name, value): bases=(Index,), additional_attributes={ "__init__": _DELETE, + "__setattr__": Index__setattr__, "_data": _FastSlowAttribute("_data", private=True), "_mask": _FastSlowAttribute("_mask", private=True), - "__setattr__": Index__setattr__, - "name": property(name), + "name": _FastSlowAttribute("name"), }, ) @@ -470,10 +466,10 @@ def Index__setattr__(self, name, value): bases=(Index,), additional_attributes={ "__init__": _DELETE, + "__setattr__": Index__setattr__, "_data": _FastSlowAttribute("_data", private=True), "_mask": _FastSlowAttribute("_mask", private=True), - "__setattr__": Index__setattr__, - "name": property(name), + "name": _FastSlowAttribute("name"), }, ) @@ -508,10 +504,6 @@ def Index__setattr__(self, name, value): ) -def names(self): - return self._fsproxy_wrapped._names - - MultiIndex = make_final_proxy_type( "MultiIndex", cudf.MultiIndex, @@ -522,7 +514,7 @@ def names(self): additional_attributes={ "__init__": _DELETE, "__setattr__": Index__setattr__, - "name": property(names), + "names": _FastSlowAttribute("names"), }, ) @@ -709,10 +701,10 @@ def names(self): bases=(Index,), additional_attributes={ "__init__": _DELETE, + "__setattr__": Index__setattr__, "_data": _FastSlowAttribute("_data", private=True), 
"_mask": _FastSlowAttribute("_mask", private=True), - "__setattr__": Index__setattr__, - "name": property(name), + "name": _FastSlowAttribute("name"), }, ) From 6a954e299d97f69a62fd184529fa7d5f29c0e09f Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Tue, 16 Jul 2024 15:02:20 -0700 Subject: [PATCH 11/11] Migrate expressions to pylibcudf (#16056) xref #15162 Migrates expressions to use pylibcudf. Authors: - Thomas Li (https://github.com/lithomas1) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/16056 --- .../api_docs/pylibcudf/datetime.rst | 6 +- .../api_docs/pylibcudf/expressions.rst | 6 + .../user_guide/api_docs/pylibcudf/index.rst | 1 + python/cudf/cudf/_lib/CMakeLists.txt | 1 - python/cudf/cudf/_lib/__init__.py | 3 +- python/cudf/cudf/_lib/expressions.pyx | 156 -------------- python/cudf/cudf/_lib/parquet.pyx | 2 +- .../cudf/cudf/_lib/pylibcudf/CMakeLists.txt | 1 + python/cudf/cudf/_lib/pylibcudf/__init__.pxd | 1 + python/cudf/cudf/_lib/pylibcudf/__init__.py | 1 + .../cudf/_lib/{ => pylibcudf}/expressions.pxd | 29 ++- .../cudf/cudf/_lib/pylibcudf/expressions.pyx | 195 ++++++++++++++++++ .../_lib/pylibcudf/libcudf/CMakeLists.txt | 4 +- .../_lib/pylibcudf/libcudf/expressions.pxd | 103 ++++----- .../_lib/pylibcudf/libcudf/expressions.pyx | 0 python/cudf/cudf/_lib/transform.pyx | 2 +- .../cudf/cudf/core/_internals/expressions.py | 11 +- .../cudf/pylibcudf_tests/test_expressions.py | 50 +++++ 18 files changed, 335 insertions(+), 237 deletions(-) create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/expressions.rst delete mode 100644 python/cudf/cudf/_lib/expressions.pyx rename python/cudf/cudf/_lib/{ => pylibcudf}/expressions.pxd (50%) create mode 100644 python/cudf/cudf/_lib/pylibcudf/expressions.pyx create mode 100644 python/cudf/cudf/_lib/pylibcudf/libcudf/expressions.pyx create mode 100644
python/cudf/cudf/pylibcudf_tests/test_expressions.py diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/datetime.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/datetime.rst index ebf5fab3052..558268ea495 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/datetime.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/datetime.rst @@ -1,6 +1,6 @@ -======= -copying -======= +======== +datetime +======== .. automodule:: cudf._lib.pylibcudf.datetime :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/expressions.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/expressions.rst new file mode 100644 index 00000000000..03f769ee861 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/expressions.rst @@ -0,0 +1,6 @@ +=========== +expressions +=========== + +.. automodule:: cudf._lib.pylibcudf.expressions + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst index 5899d272160..505765bba0f 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst @@ -15,6 +15,7 @@ This page provides API documentation for pylibcudf. concatenate copying datetime + expressions filling gpumemoryview groupby diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index 5a067e84f56..38b7e9ebe04 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -21,7 +21,6 @@ set(cython_sources copying.pyx csv.pyx datetime.pyx - expressions.pyx filling.pyx groupby.pyx hash.pyx diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py index 18b95f5f2e1..34c0e29d0b1 100644 --- a/python/cudf/cudf/_lib/__init__.py +++ b/python/cudf/cudf/_lib/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. import numpy as np from . 
import ( @@ -8,7 +8,6 @@ copying, csv, datetime, - expressions, filling, groupby, hash, diff --git a/python/cudf/cudf/_lib/expressions.pyx b/python/cudf/cudf/_lib/expressions.pyx deleted file mode 100644 index 3fb29279ed7..00000000000 --- a/python/cudf/cudf/_lib/expressions.pyx +++ /dev/null @@ -1,156 +0,0 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. - -from enum import Enum - -import numpy as np - -from cython.operator cimport dereference -from libc.stdint cimport int64_t -from libcpp.memory cimport make_unique, unique_ptr -from libcpp.string cimport string -from libcpp.utility cimport move - -from cudf._lib.pylibcudf.libcudf cimport expressions as libcudf_exp -from cudf._lib.pylibcudf.libcudf.types cimport size_type -from cudf._lib.pylibcudf.libcudf.wrappers.timestamps cimport ( - timestamp_ms, - timestamp_us, -) - -# Necessary for proper casting, see below. -ctypedef int32_t underlying_type_ast_operator - - -# Aliases for simplicity -ctypedef unique_ptr[libcudf_exp.expression] expression_ptr - - -class ASTOperator(Enum): - ADD = libcudf_exp.ast_operator.ADD - SUB = libcudf_exp.ast_operator.SUB - MUL = libcudf_exp.ast_operator.MUL - DIV = libcudf_exp.ast_operator.DIV - TRUE_DIV = libcudf_exp.ast_operator.TRUE_DIV - FLOOR_DIV = libcudf_exp.ast_operator.FLOOR_DIV - MOD = libcudf_exp.ast_operator.MOD - PYMOD = libcudf_exp.ast_operator.PYMOD - POW = libcudf_exp.ast_operator.POW - EQUAL = libcudf_exp.ast_operator.EQUAL - NULL_EQUAL = libcudf_exp.ast_operator.NULL_EQUAL - NOT_EQUAL = libcudf_exp.ast_operator.NOT_EQUAL - LESS = libcudf_exp.ast_operator.LESS - GREATER = libcudf_exp.ast_operator.GREATER - LESS_EQUAL = libcudf_exp.ast_operator.LESS_EQUAL - GREATER_EQUAL = libcudf_exp.ast_operator.GREATER_EQUAL - BITWISE_AND = libcudf_exp.ast_operator.BITWISE_AND - BITWISE_OR = libcudf_exp.ast_operator.BITWISE_OR - BITWISE_XOR = libcudf_exp.ast_operator.BITWISE_XOR - LOGICAL_AND = libcudf_exp.ast_operator.LOGICAL_AND - NULL_LOGICAL_AND = 
libcudf_exp.ast_operator.NULL_LOGICAL_AND - LOGICAL_OR = libcudf_exp.ast_operator.LOGICAL_OR - NULL_LOGICAL_OR = libcudf_exp.ast_operator.NULL_LOGICAL_OR - # Unary operators - IDENTITY = libcudf_exp.ast_operator.IDENTITY - IS_NULL = libcudf_exp.ast_operator.IS_NULL - SIN = libcudf_exp.ast_operator.SIN - COS = libcudf_exp.ast_operator.COS - TAN = libcudf_exp.ast_operator.TAN - ARCSIN = libcudf_exp.ast_operator.ARCSIN - ARCCOS = libcudf_exp.ast_operator.ARCCOS - ARCTAN = libcudf_exp.ast_operator.ARCTAN - SINH = libcudf_exp.ast_operator.SINH - COSH = libcudf_exp.ast_operator.COSH - TANH = libcudf_exp.ast_operator.TANH - ARCSINH = libcudf_exp.ast_operator.ARCSINH - ARCCOSH = libcudf_exp.ast_operator.ARCCOSH - ARCTANH = libcudf_exp.ast_operator.ARCTANH - EXP = libcudf_exp.ast_operator.EXP - LOG = libcudf_exp.ast_operator.LOG - SQRT = libcudf_exp.ast_operator.SQRT - CBRT = libcudf_exp.ast_operator.CBRT - CEIL = libcudf_exp.ast_operator.CEIL - FLOOR = libcudf_exp.ast_operator.FLOOR - ABS = libcudf_exp.ast_operator.ABS - RINT = libcudf_exp.ast_operator.RINT - BIT_INVERT = libcudf_exp.ast_operator.BIT_INVERT - NOT = libcudf_exp.ast_operator.NOT - - -class TableReference(Enum): - LEFT = libcudf_exp.table_reference.LEFT - RIGHT = libcudf_exp.table_reference.RIGHT - - -# Note that this function only currently supports numeric literals. libcudf -# expressions don't really support other types yet though, so this isn't -# restrictive at the moment. 
-cdef class Literal(Expression): - def __cinit__(self, value): - if isinstance(value, int): - self.c_scalar.reset(new numeric_scalar[int64_t](value, True)) - self.c_obj = move(make_unique[libcudf_exp.literal]( - dereference(self.c_scalar) - )) - elif isinstance(value, float): - self.c_scalar.reset(new numeric_scalar[double](value, True)) - self.c_obj = move(make_unique[libcudf_exp.literal]( - dereference(self.c_scalar) - )) - elif isinstance(value, str): - self.c_scalar.reset(new string_scalar(value.encode(), True)) - self.c_obj = move(make_unique[libcudf_exp.literal]( - dereference(self.c_scalar) - )) - elif isinstance(value, np.datetime64): - scale, _ = np.datetime_data(value.dtype) - int_value = value.astype(np.int64) - if scale == "ms": - self.c_scalar.reset(new timestamp_scalar[timestamp_ms]( - int_value, True) - ) - self.c_obj = move(make_unique[libcudf_exp.literal]( - dereference(self.c_scalar) - )) - elif scale == "us": - self.c_scalar.reset(new timestamp_scalar[timestamp_us]( - int_value, True) - ) - self.c_obj = move(make_unique[libcudf_exp.literal]( - dereference(self.c_scalar) - )) - else: - raise NotImplementedError( - f"Unhandled datetime scale {scale=}" - ) - else: - raise NotImplementedError( - f"Don't know how to make literal with type {type(value)}" - ) - - -cdef class ColumnReference(Expression): - def __cinit__(self, size_type index): - self.c_obj = move(make_unique[libcudf_exp.column_reference]( - index - )) - - -cdef class Operation(Expression): - def __cinit__(self, op, Expression left, Expression right=None): - cdef libcudf_exp.ast_operator op_value = ( - op.value - ) - - if right is None: - self.c_obj = move(make_unique[libcudf_exp.operation]( - op_value, dereference(left.c_obj) - )) - else: - self.c_obj = move(make_unique[libcudf_exp.operation]( - op_value, dereference(left.c_obj), dereference(right.c_obj) - )) - -cdef class ColumnNameReference(Expression): - def __cinit__(self, string name): - self.c_obj = \ - 
move(make_unique[libcudf_exp.column_name_reference](name)) diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index 158fb6051c3..e7959d21e01 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -37,12 +37,12 @@ cimport cudf._lib.pylibcudf.libcudf.io.data_sink as cudf_io_data_sink cimport cudf._lib.pylibcudf.libcudf.io.types as cudf_io_types cimport cudf._lib.pylibcudf.libcudf.types as cudf_types from cudf._lib.column cimport Column -from cudf._lib.expressions cimport Expression from cudf._lib.io.utils cimport ( make_sinks_info, make_source_info, update_struct_field_names, ) +from cudf._lib.pylibcudf.expressions cimport Expression from cudf._lib.pylibcudf.io.datasource cimport NativeFileDatasource from cudf._lib.pylibcudf.libcudf.expressions cimport expression from cudf._lib.pylibcudf.libcudf.io.parquet cimport ( diff --git a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt index a2d11bbea6e..0800fa18e94 100644 --- a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt @@ -20,6 +20,7 @@ set(cython_sources concatenate.pyx copying.pyx datetime.pyx + expressions.pyx filling.pyx gpumemoryview.pyx groupby.pyx diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd index da2b7806203..26e89b818d3 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd @@ -8,6 +8,7 @@ from . 
cimport ( concatenate, copying, datetime, + expressions, filling, groupby, join, diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.py b/python/cudf/cudf/_lib/pylibcudf/__init__.py index acbc84d7177..e89a5ed9f96 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.py +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.py @@ -7,6 +7,7 @@ concatenate, copying, datetime, + expressions, filling, groupby, interop, diff --git a/python/cudf/cudf/_lib/expressions.pxd b/python/cudf/cudf/_lib/pylibcudf/expressions.pxd similarity index 50% rename from python/cudf/cudf/_lib/expressions.pxd rename to python/cudf/cudf/_lib/pylibcudf/expressions.pxd index 4a20c5fc545..64825b89d9f 100644 --- a/python/cudf/cudf/_lib/expressions.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/expressions.pxd @@ -1,36 +1,31 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. - -from libc.stdint cimport int32_t, int64_t +# Copyright (c) 2024, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr +from libcpp.string cimport string from cudf._lib.pylibcudf.libcudf.expressions cimport ( - column_reference, + ast_operator, expression, - literal, - operation, -) -from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport ( - numeric_scalar, - scalar, - string_scalar, - timestamp_scalar, + table_reference, ) +from .scalar cimport Scalar + cdef class Expression: cdef unique_ptr[expression] c_obj - cdef class Literal(Expression): - cdef unique_ptr[scalar] c_scalar - + # Hold on to input scalar so it doesn't get gc'ed + cdef Scalar scalar cdef class ColumnReference(Expression): pass - cdef class Operation(Expression): - pass + # Hold on to the input expressions so + # they don't get gc'ed + cdef Expression right + cdef Expression left cdef class ColumnNameReference(Expression): pass diff --git a/python/cudf/cudf/_lib/pylibcudf/expressions.pyx b/python/cudf/cudf/_lib/pylibcudf/expressions.pyx new file mode 100644 index 00000000000..38de11406ad --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/expressions.pyx @@ -0,0 
+1,195 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from cudf._lib.pylibcudf.libcudf.expressions import \ + ast_operator as ASTOperator # no-cython-lint +from cudf._lib.pylibcudf.libcudf.expressions import \ + table_reference as TableReference # no-cython-lint + +from cython.operator cimport dereference +from libc.stdint cimport int32_t, int64_t +from libcpp.memory cimport make_unique, unique_ptr +from libcpp.string cimport string +from libcpp.utility cimport move + +from cudf._lib.pylibcudf.libcudf cimport expressions as libcudf_exp +from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport ( + duration_scalar, + numeric_scalar, + string_scalar, + timestamp_scalar, +) +from cudf._lib.pylibcudf.libcudf.types cimport size_type, type_id +from cudf._lib.pylibcudf.libcudf.wrappers.durations cimport ( + duration_ms, + duration_ns, + duration_s, + duration_us, +) +from cudf._lib.pylibcudf.libcudf.wrappers.timestamps cimport ( + timestamp_ms, + timestamp_ns, + timestamp_s, + timestamp_us, +) + +from .scalar cimport Scalar +from .traits cimport is_chrono, is_numeric +from .types cimport DataType + +# Aliases for simplicity +ctypedef unique_ptr[libcudf_exp.expression] expression_ptr + +cdef class Literal(Expression): + """ + A literal value used in an abstract syntax tree. + + For details, see :cpp:class:`cudf::ast::literal`. + + Parameters + ---------- + value : Scalar + The Scalar value of the Literal. + Must be either numeric, string, or a timestamp/duration scalar. 
+ """ + def __cinit__(self, Scalar value): + self.scalar = value + cdef DataType typ = value.type() + cdef type_id tid = value.type().id() + if not (is_numeric(typ) or is_chrono(typ) or tid == type_id.STRING): + raise ValueError( + "Only numeric, string, or timestamp/duration scalars are accepted" + ) + # TODO: Accept type-erased scalar in AST C++ code + # Then a lot of this code can be deleted + if tid == type_id.INT64: + self.c_obj = move(make_unique[libcudf_exp.literal]( + dereference(self.scalar.c_obj) + )) + elif tid == type_id.INT32: + self.c_obj = move(make_unique[libcudf_exp.literal]( + dereference(self.scalar.c_obj) + )) + elif tid == type_id.FLOAT64: + self.c_obj = move(make_unique[libcudf_exp.literal]( + dereference(self.scalar.c_obj) + )) + elif tid == type_id.FLOAT32: + self.c_obj = move(make_unique[libcudf_exp.literal]( + dereference(self.scalar.c_obj) + )) + elif tid == type_id.STRING: + self.c_obj = move(make_unique[libcudf_exp.literal]( + dereference(self.scalar.c_obj) + )) + elif tid == type_id.TIMESTAMP_NANOSECONDS: + self.c_obj = move(make_unique[libcudf_exp.literal]( + dereference(self.scalar.c_obj) + )) + elif tid == type_id.TIMESTAMP_MICROSECONDS: + self.c_obj = move(make_unique[libcudf_exp.literal]( + dereference(self.scalar.c_obj) + )) + elif tid == type_id.TIMESTAMP_MILLISECONDS: + self.c_obj = move(make_unique[libcudf_exp.literal]( + dereference(self.scalar.c_obj) + )) + elif tid == type_id.TIMESTAMP_MILLISECONDS: + self.c_obj = move(make_unique[libcudf_exp.literal]( + dereference(self.scalar.c_obj) + )) + elif tid == type_id.TIMESTAMP_SECONDS: + self.c_obj = move(make_unique[libcudf_exp.literal]( + dereference(self.scalar.c_obj) + )) + elif tid == type_id.DURATION_NANOSECONDS: + self.c_obj = move(make_unique[libcudf_exp.literal]( + dereference(self.scalar.c_obj) + )) + elif tid == type_id.DURATION_MICROSECONDS: + self.c_obj = move(make_unique[libcudf_exp.literal]( + dereference(self.scalar.c_obj) + )) + elif tid == 
type_id.DURATION_MILLISECONDS: + self.c_obj = move(make_unique[libcudf_exp.literal]( + dereference(self.scalar.c_obj) + )) + elif tid == type_id.DURATION_MILLISECONDS: + self.c_obj = move(make_unique[libcudf_exp.literal]( + dereference(self.scalar.c_obj) + )) + elif tid == type_id.DURATION_SECONDS: + self.c_obj = move(make_unique[libcudf_exp.literal]( + dereference(self.scalar.c_obj) + )) + else: + raise NotImplementedError( + f"Don't know how to make literal with type id {tid}" + ) + +cdef class ColumnReference(Expression): + """ + An expression referring to data from a column in a table. + + For details, see :cpp:class:`cudf::ast::column_reference`. + + Parameters + ---------- + index : size_type + The index of this column in the table + (provided when the expression is evaluated). + table_source : TableReference, default TableReferenece.LEFT + Which table to use in cases with two tables (e.g. joins) + """ + def __cinit__( + self, + size_type index, + table_reference table_source=table_reference.LEFT + ): + self.c_obj = move(make_unique[libcudf_exp.column_reference]( + index, table_source + )) + + +cdef class Operation(Expression): + """ + An operation expression holds an operator and zero or more operands. + + For details, see :cpp:class:`cudf::ast::operation`. + + Parameters + ---------- + op : Operator + left : Expression + Left input expression (left operand) + right: Expression, default None + Right input expression (right operand). + You should only pass this if the input expression is a binary operation. 
+ """ + def __cinit__(self, ast_operator op, Expression left, Expression right=None): + self.left = left + self.right = right + if right is None: + self.c_obj = move(make_unique[libcudf_exp.operation]( + op, dereference(left.c_obj) + )) + else: + self.c_obj = move(make_unique[libcudf_exp.operation]( + op, dereference(left.c_obj), dereference(right.c_obj) + )) + +cdef class ColumnNameReference(Expression): + """ + An expression referring to data from a column in a table. + + For details, see :cpp:class:`cudf::ast::column_name_reference`. + + Parameters + ---------- + column_name : str + Name of this column in the table metadata + (provided when the expression is evaluated). + """ + def __cinit__(self, str name): + self.c_obj = \ + move(make_unique[libcudf_exp.column_name_reference]( + (name.encode("utf-8")) + )) diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/libcudf/CMakeLists.txt index 699e85ce567..b04e94f1546 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/CMakeLists.txt @@ -12,8 +12,8 @@ # the License. # ============================================================================= -set(cython_sources aggregation.pyx binaryop.pyx copying.pyx reduce.pyx replace.pyx round.pyx - stream_compaction.pyx types.pyx unary.pyx +set(cython_sources aggregation.pyx binaryop.pyx copying.pyx expressions.pyx reduce.pyx replace.pyx + round.pyx stream_compaction.pyx types.pyx unary.pyx ) set(linked_libraries cudf::cudf) diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/expressions.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/expressions.pxd index 279d969db50..427e16d4ff8 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/expressions.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/expressions.pxd @@ -1,5 +1,6 @@ # Copyright (c) 2022-2024, NVIDIA CORPORATION. 
+from libc.stdint cimport int32_t from libcpp.memory cimport unique_ptr from libcpp.string cimport string @@ -14,63 +15,63 @@ from cudf._lib.pylibcudf.libcudf.types cimport size_type cdef extern from "cudf/ast/expressions.hpp" namespace "cudf::ast" nogil: - ctypedef enum ast_operator: + cpdef enum class ast_operator(int32_t): # Binary operators - ADD "cudf::ast::ast_operator::ADD" - SUB "cudf::ast::ast_operator::SUB" - MUL "cudf::ast::ast_operator::MUL" - DIV "cudf::ast::ast_operator::DIV" - TRUE_DIV "cudf::ast::ast_operator::TRUE_DIV" - FLOOR_DIV "cudf::ast::ast_operator::FLOOR_DIV" - MOD "cudf::ast::ast_operator::MOD" - PYMOD "cudf::ast::ast_operator::PYMOD" - POW "cudf::ast::ast_operator::POW" - EQUAL "cudf::ast::ast_operator::EQUAL" - NULL_EQUAL "cudf::ast::ast_operator::NULL_EQUAL" - NOT_EQUAL "cudf::ast::ast_operator::NOT_EQUAL" - LESS "cudf::ast::ast_operator::LESS" - GREATER "cudf::ast::ast_operator::GREATER" - LESS_EQUAL "cudf::ast::ast_operator::LESS_EQUAL" - GREATER_EQUAL "cudf::ast::ast_operator::GREATER_EQUAL" - BITWISE_AND "cudf::ast::ast_operator::BITWISE_AND" - BITWISE_OR "cudf::ast::ast_operator::BITWISE_OR" - BITWISE_XOR "cudf::ast::ast_operator::BITWISE_XOR" - NULL_LOGICAL_AND "cudf::ast::ast_operator::NULL_LOGICAL_AND" - LOGICAL_AND "cudf::ast::ast_operator::LOGICAL_AND" - NULL_LOGICAL_OR "cudf::ast::ast_operator::NULL_LOGICAL_OR" - LOGICAL_OR "cudf::ast::ast_operator::LOGICAL_OR" + ADD + SUB + MUL + DIV + TRUE_DIV + FLOOR_DIV + MOD + PYMOD + POW + EQUAL + NULL_EQUAL + NOT_EQUAL + LESS + GREATER + LESS_EQUAL + GREATER_EQUAL + BITWISE_AND + BITWISE_OR + BITWISE_XOR + NULL_LOGICAL_AND + LOGICAL_AND + NULL_LOGICAL_OR + LOGICAL_OR # Unary operators - IDENTITY "cudf::ast::ast_operator::IDENTITY" - IS_NULL "cudf::ast::ast_operator::IS_NULL" - SIN "cudf::ast::ast_operator::SIN" - COS "cudf::ast::ast_operator::COS" - TAN "cudf::ast::ast_operator::TAN" - ARCSIN "cudf::ast::ast_operator::ARCSIN" - ARCCOS "cudf::ast::ast_operator::ARCCOS" - ARCTAN 
"cudf::ast::ast_operator::ARCTAN" - SINH "cudf::ast::ast_operator::SINH" - COSH "cudf::ast::ast_operator::COSH" - TANH "cudf::ast::ast_operator::TANH" - ARCSINH "cudf::ast::ast_operator::ARCSINH" - ARCCOSH "cudf::ast::ast_operator::ARCCOSH" - ARCTANH "cudf::ast::ast_operator::ARCTANH" - EXP "cudf::ast::ast_operator::EXP" - LOG "cudf::ast::ast_operator::LOG" - SQRT "cudf::ast::ast_operator::SQRT" - CBRT "cudf::ast::ast_operator::CBRT" - CEIL "cudf::ast::ast_operator::CEIL" - FLOOR "cudf::ast::ast_operator::FLOOR" - ABS "cudf::ast::ast_operator::ABS" - RINT "cudf::ast::ast_operator::RINT" - BIT_INVERT "cudf::ast::ast_operator::BIT_INVERT" - NOT "cudf::ast::ast_operator::NOT" + IDENTITY + IS_NULL + SIN + COS + TAN + ARCSIN + ARCCOS + ARCTAN + SINH + COSH + TANH + ARCSINH + ARCCOSH + ARCTANH + EXP + LOG + SQRT + CBRT + CEIL + FLOOR + ABS + RINT + BIT_INVERT + NOT cdef cppclass expression: pass - ctypedef enum table_reference: - LEFT "cudf::ast::table_reference::LEFT" - RIGHT "cudf::ast::table_reference::RIGHT" + cpdef enum class table_reference(int32_t): + LEFT + RIGHT cdef cppclass literal(expression): # Due to https://github.com/cython/cython/issues/3198, we need to diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/expressions.pyx b/python/cudf/cudf/_lib/pylibcudf/libcudf/expressions.pyx new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/cudf/cudf/_lib/transform.pyx b/python/cudf/cudf/_lib/transform.pyx index 86a4a60eef1..622725e06a3 100644 --- a/python/cudf/cudf/_lib/transform.pyx +++ b/python/cudf/cudf/_lib/transform.pyx @@ -19,8 +19,8 @@ from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer cimport cudf._lib.pylibcudf.libcudf.transform as libcudf_transform from cudf._lib.column cimport Column -from cudf._lib.expressions cimport Expression from cudf._lib.pylibcudf cimport transform as plc_transform +from cudf._lib.pylibcudf.expressions cimport Expression from cudf._lib.pylibcudf.libcudf.column.column cimport column from 
cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view from cudf._lib.pylibcudf.libcudf.expressions cimport expression diff --git a/python/cudf/cudf/core/_internals/expressions.py b/python/cudf/cudf/core/_internals/expressions.py index 393a68dd844..63714a78572 100644 --- a/python/cudf/cudf/core/_internals/expressions.py +++ b/python/cudf/cudf/core/_internals/expressions.py @@ -4,7 +4,10 @@ import ast import functools -from cudf._lib.expressions import ( +import pyarrow as pa + +import cudf._lib.pylibcudf as plc +from cudf._lib.pylibcudf.expressions import ( ASTOperator, ColumnReference, Expression, @@ -122,7 +125,9 @@ def visit_Constant(self, node): f"Unsupported literal {repr(node.value)} of type " "{type(node.value).__name__}" ) - self.stack.append(Literal(node.value)) + self.stack.append( + Literal(plc.interop.from_arrow(pa.scalar(node.value))) + ) def visit_UnaryOp(self, node): self.visit(node.operand) @@ -132,7 +137,7 @@ def visit_UnaryOp(self, node): # operand, so there's no way to know whether this should be a float # or an int. We should maybe see what Spark does, and this will # probably require casting. - self.nodes.append(Literal(-1)) + self.nodes.append(Literal(plc.interop.from_arrow(pa.scalar(-1)))) op = ASTOperator.MUL self.stack.append(Operation(op, self.nodes[-1], self.nodes[-2])) elif isinstance(node.op, ast.UAdd): diff --git a/python/cudf/cudf/pylibcudf_tests/test_expressions.py b/python/cudf/cudf/pylibcudf_tests/test_expressions.py new file mode 100644 index 00000000000..f661512caad --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/test_expressions.py @@ -0,0 +1,50 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+import pyarrow as pa +import pytest + +import cudf._lib.pylibcudf as plc + +# We can't really evaluate these expressions, so just make sure +# construction works properly + + +def test_literal_construction_invalid(): + with pytest.raises(ValueError): + plc.expressions.Literal( + plc.interop.from_arrow(pa.scalar(None, type=pa.list_(pa.int64()))) + ) + + +@pytest.mark.parametrize( + "tableref", + [ + plc.expressions.TableReference.LEFT, + plc.expressions.TableReference.RIGHT, + ], +) +def test_columnref_construction(tableref): + plc.expressions.ColumnReference(1.0, tableref) + + +def test_columnnameref_construction(): + plc.expressions.ColumnNameReference("abc") + + +@pytest.mark.parametrize( + "kwargs", + [ + # Unary op + { + "op": plc.expressions.ASTOperator.IDENTITY, + "left": plc.expressions.ColumnReference(1), + }, + # Binop + { + "op": plc.expressions.ASTOperator.ADD, + "left": plc.expressions.ColumnReference(1), + "right": plc.expressions.ColumnReference(2), + }, + ], +) +def test_astoperation_construction(kwargs): + plc.expressions.Operation(**kwargs)