From 1889c7c0f517c95143016a6e391275144a034f7a Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Mon, 15 Jul 2024 20:32:15 +0200 Subject: [PATCH 01/17] MAINT: Adapt to NumPy 2 promotion changes (#16141) Splitting out the non API changes from gh-15897, the Scalar API change is required for the tests to pass with NumPy 2, but almost all changes should be relatively straight forward here on their own. (I will add inline comments.) --- This PR does not fix integer comparisons, there are currently no tests that run into these. xref: https://github.com/rapidsai/build-planning/issues/38 Authors: - Sebastian Berg (https://github.com/seberg) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/16141 --- python/cudf/cudf/core/_internals/where.py | 24 +++++++++++------- python/cudf/cudf/core/column/categorical.py | 4 ++- python/cudf/cudf/core/column/numerical.py | 27 ++++++++++++++++----- python/cudf/cudf/tests/test_binops.py | 21 +++++++++++++--- python/cudf/cudf/tests/test_doctests.py | 13 +++++++++- python/cudf/cudf/tests/test_dtypes.py | 1 - 6 files changed, 69 insertions(+), 21 deletions(-) diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py index 44ce0ddef25..f3183e6029d 100644 --- a/python/cudf/cudf/core/_internals/where.py +++ b/python/cudf/cudf/core/_internals/where.py @@ -54,13 +54,17 @@ def _check_and_cast_columns_with_other( other_is_scalar = is_scalar(other) if other_is_scalar: - if (isinstance(other, float) and not np.isnan(other)) and ( - source_dtype.type(other) != other - ): - raise TypeError( - f"Cannot safely cast non-equivalent " - f"{type(other).__name__} to {source_dtype.name}" - ) + if isinstance(other, float) and not np.isnan(other): + try: + is_safe = source_dtype.type(other) == other + except OverflowError: + is_safe = False + + if not is_safe: + raise TypeError( + f"Cannot safely cast non-equivalent " + f"{type(other).__name__} to {source_dtype.name}" 
+ ) if cudf.utils.utils.is_na_like(other): return _normalize_categorical( @@ -84,8 +88,10 @@ def _check_and_cast_columns_with_other( ) return _normalize_categorical(source_col, other.astype(source_dtype)) - if _is_non_decimal_numeric_dtype(source_dtype) and _can_cast( - other, source_dtype + if ( + _is_non_decimal_numeric_dtype(source_dtype) + and not other_is_scalar # can-cast fails for Python scalars + and _can_cast(other, source_dtype) ): common_dtype = source_dtype elif ( diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index f763d3b4b0c..9aaccca349d 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -47,7 +47,9 @@ ) -_DEFAULT_CATEGORICAL_VALUE = -1 +# Using np.int8(-1) to allow silent wrap-around when casting to uint +# it may make sense to make this dtype specific or a function. +_DEFAULT_CATEGORICAL_VALUE = np.int8(-1) class CategoricalAccessor(ColumnMethods): diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index a0550bff72b..b8fa00e9643 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -301,15 +301,28 @@ def normalize_binop_value( if isinstance(other, cudf.Scalar): if self.dtype == other.dtype: return other + # expensive device-host transfer just to # adjust the dtype other = other.value + + # NumPy 2 needs a Python scalar to do weak promotion, but + # pandas forces weak promotion always + # TODO: We could use 0, 0.0, and 0j for promotion to avoid copies. + if other.dtype.kind in "ifc": + other = other.item() + elif not isinstance(other, (int, float, complex)): + # Go via NumPy to get the value + other = np.array(other) + if other.dtype.kind in "ifc": + other = other.item() + # Try and match pandas and hence numpy. Deduce the common - # dtype via the _value_ of other, and the dtype of self. 
TODO: - # When NEP50 is accepted, this might want changed or - # simplified. - # This is not at all simple: - # np.result_type(np.int64(0), np.uint8) + # dtype via the _value_ of other, and the dtype of self on NumPy 1.x + # with NumPy 2, we force weak promotion even for our/NumPy scalars + # to match pandas 2.2. + # Weak promotion is not at all simple: + # np.result_type(0, np.uint8) # => np.uint8 # np.result_type(np.asarray([0], dtype=np.int64), np.uint8) # => np.int64 @@ -626,7 +639,9 @@ def can_cast_safely(self, to_dtype: DtypeObj) -> bool: min_, max_ = iinfo.min, iinfo.max # best we can do is hope to catch it here and avoid compare - if (self.min() >= min_) and (self.max() <= max_): + # Use Python floats, which have precise comparison for float64. + # NOTE(seberg): it would make sense to limit to the mantissa range. + if (float(self.min()) >= min_) and (float(self.max()) <= max_): filled = self.fillna(0) return (cudf.Series(filled) % 1 == 0).all() else: diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py index 7d8c3b53115..5265278db4c 100644 --- a/python/cudf/cudf/tests/test_binops.py +++ b/python/cudf/cudf/tests/test_binops.py @@ -539,7 +539,14 @@ def test_series_reflected_ops_scalar(func, dtype, obj_class): if obj_class == "Index": gs = Index(gs) - gs_result = func(gs) + try: + gs_result = func(gs) + except OverflowError: + # An error is fine, if pandas raises the same error: + with pytest.raises(OverflowError): + func(random_series) + + return # class typing if obj_class == "Index": @@ -589,7 +596,14 @@ def test_series_reflected_ops_cudf_scalar(funcs, dtype, obj_class): if obj_class == "Index": gs = Index(gs) - gs_result = gpu_func(gs) + try: + gs_result = gpu_func(gs) + except OverflowError: + # An error is fine, if pandas raises the same error: + with pytest.raises(OverflowError): + cpu_func(random_series) + + return # class typing if obj_class == "Index": @@ -770,7 +784,8 @@ def test_operator_func_series_and_scalar( 
fill_value=fill_value, ) pdf_series_result = getattr(pdf_series, func)( - scalar, fill_value=fill_value + np.array(scalar)[()] if use_cudf_scalar else scalar, + fill_value=fill_value, ) assert_eq(pdf_series_result, gdf_series_result) diff --git a/python/cudf/cudf/tests/test_doctests.py b/python/cudf/cudf/tests/test_doctests.py index 0da5c6b04d6..794660cffcb 100644 --- a/python/cudf/cudf/tests/test_doctests.py +++ b/python/cudf/cudf/tests/test_doctests.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. import contextlib import doctest import inspect @@ -8,6 +8,7 @@ import numpy as np import pytest +from packaging import version import cudf @@ -80,6 +81,16 @@ def chdir_to_tmp_path(cls, tmp_path): yield os.chdir(original_directory) + @pytest.fixture(autouse=True) + def prinoptions(cls): + # TODO: NumPy now prints scalars as `np.int8(1)`, etc. this should + # be adapted evantually. + if version.parse(np.__version__) >= version.parse("2.0"): + with np.printoptions(legacy="1.25"): + yield + else: + yield + @pytest.mark.parametrize( "docstring", itertools.chain(*[_find_doctests_in_obj(mod) for mod in tests]), diff --git a/python/cudf/cudf/tests/test_dtypes.py b/python/cudf/cudf/tests/test_dtypes.py index edb534a3618..c62b5889fdd 100644 --- a/python/cudf/cudf/tests/test_dtypes.py +++ b/python/cudf/cudf/tests/test_dtypes.py @@ -341,7 +341,6 @@ def test_dtype(in_dtype, expect): np.complex128, complex, "S", - "a", "V", "float16", np.float16, From 128f0c917bbc3342f9eca12ca2bf714c88206256 Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Mon, 15 Jul 2024 20:34:14 +0200 Subject: [PATCH 02/17] API: Check for integer overflows when creating scalar from Python int (#16140) This aligns with NumPy, which deprecated this a while ago and raises an error now on NumPy 2, for example for `Scalar(-1, dtype=np.uint8)`. Since it aligns with NumPy, the DeprecationWarning of earlier NumPy versions is inherited for those.
This (or similar handling) is required to be compatible with NumPy 2/pandas, since the default needs to be to reject the operation when values are out of bounds: e.g. for `uint8_series + 1000`, the 1000 should not be silently cast to a `uint8`. --- Split from gh-15897 xref: https://github.com/rapidsai/build-planning/issues/38 Authors: - Sebastian Berg (https://github.com/seberg) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/16140 --- python/cudf/cudf/tests/test_scalar.py | 17 +++++++++++++++++ python/cudf/cudf/tests/test_unaops.py | 5 ++++- python/cudf/cudf/utils/dtypes.py | 14 ++++++++------ 3 files changed, 29 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/tests/test_scalar.py b/python/cudf/cudf/tests/test_scalar.py index 05a91a8fea3..195231e9960 100644 --- a/python/cudf/cudf/tests/test_scalar.py +++ b/python/cudf/cudf/tests/test_scalar.py @@ -8,6 +8,7 @@ import pandas as pd import pyarrow as pa import pytest +from packaging import version import rmm @@ -253,6 +254,22 @@ def test_generic_null_scalar_construction_fails(value): cudf.Scalar(value) +@pytest.mark.parametrize( + "value, dtype", [(1000, "uint8"), (2**30, "int16"), (-1, "uint16")] +) +@pytest.mark.filterwarnings("ignore::DeprecationWarning") +def test_scalar_out_of_bounds_pyint_fails(value, dtype): + # Test that we align with NumPy on scalar creation behavior from + # Python integers. + if version.parse(np.__version__) >= version.parse("2.0"): + with pytest.raises(OverflowError): + cudf.Scalar(value, dtype) + else: + # NumPy allowed this, but it gives a DeprecationWarning on newer + # versions (which cudf did not used to do).
+ assert cudf.Scalar(value, dtype).value == np.dtype(dtype).type(value) + + @pytest.mark.parametrize( "dtype", NUMERIC_TYPES + DATETIME_TYPES + TIMEDELTA_TYPES + ["object"] ) diff --git a/python/cudf/cudf/tests/test_unaops.py b/python/cudf/cudf/tests/test_unaops.py index dbbf4fba3a6..5f5d79c1dce 100644 --- a/python/cudf/cudf/tests/test_unaops.py +++ b/python/cudf/cudf/tests/test_unaops.py @@ -81,7 +81,10 @@ def generate_valid_scalar_unaop_combos(): @pytest.mark.parametrize("slr,dtype,op", generate_valid_scalar_unaop_combos()) def test_scalar_unary_operations(slr, dtype, op): slr_host = np.array([slr])[0].astype(cudf.dtype(dtype)) - slr_device = cudf.Scalar(slr, dtype=dtype) + # The scalar may be out of bounds, so go via array force-cast + # NOTE: This is a change in behavior + slr = np.array(slr).astype(dtype)[()] + slr_device = cudf.Scalar(slr) expect = op(slr_host) got = op(slr_device) diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 2aa3129ab30..0dec857ea96 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -253,16 +253,18 @@ def to_cudf_compatible_scalar(val, dtype=None): elif isinstance(val, datetime.timedelta): val = np.timedelta64(val) - val = _maybe_convert_to_default_type( - cudf.api.types.pandas_dtype(type(val)) - ).type(val) - if dtype is not None: - if isinstance(val, str) and np.dtype(dtype).kind == "M": + dtype = np.dtype(dtype) + if isinstance(val, str) and dtype.kind == "M": # pd.Timestamp can handle str, but not np.str_ val = pd.Timestamp(str(val)).to_datetime64().astype(dtype) else: - val = val.astype(dtype) + # At least datetimes cannot be converted to scalar via dtype.type: + val = np.array(val, dtype)[()] + else: + val = _maybe_convert_to_default_type( + cudf.api.types.pandas_dtype(type(val)) + ).type(val) if val.dtype.type is np.datetime64: time_unit, _ = np.datetime_data(val.dtype) From ceb73d91c090882ec69642a78b7d791a1bf220fe Mon Sep 17 00:00:00 2001 From: Vukasin 
Milovanovic Date: Mon, 15 Jul 2024 12:45:51 -0700 Subject: [PATCH 03/17] Make nvcomp adapter compatible with new version macros (#16245) New nvcomp version changed the names of the version macros. This PR adds "aliasing" to the old names so rest of the code is not affected. Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - MithunR (https://github.com/mythrocks) - Muhammad Haseeb (https://github.com/mhaseeb123) URL: https://github.com/rapidsai/cudf/pull/16245 --- cpp/src/io/comp/nvcomp_adapter.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/cpp/src/io/comp/nvcomp_adapter.cpp b/cpp/src/io/comp/nvcomp_adapter.cpp index 0e34c96debd..5d0c6a8c83b 100644 --- a/cpp/src/io/comp/nvcomp_adapter.cpp +++ b/cpp/src/io/comp/nvcomp_adapter.cpp @@ -37,6 +37,13 @@ #include NVCOMP_ZSTD_HEADER #endif +// When building with nvcomp 4.0 or newer, map the new version macros to the old ones +#ifndef NVCOMP_MAJOR_VERSION +#define NVCOMP_MAJOR_VERSION NVCOMP_VER_MAJOR +#define NVCOMP_MINOR_VERSION NVCOMP_VER_MINOR +#define NVCOMP_PATCH_VERSION NVCOMP_VER_PATCH +#endif + #define NVCOMP_HAS_ZSTD_DECOMP(MAJOR, MINOR, PATCH) (MAJOR > 2 or (MAJOR == 2 and MINOR >= 3)) #define NVCOMP_HAS_ZSTD_COMP(MAJOR, MINOR, PATCH) (MAJOR > 2 or (MAJOR == 2 and MINOR >= 4)) From 04330f2e9e73ac71a86666c55d0fe7248eaf8db6 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 15 Jul 2024 10:23:07 -1000 Subject: [PATCH 04/17] Fix convert_dtypes with convert_integer=False/convert_floating=True (#15964) If `convert_integer=False`, there should be no attempt to convert to integer Authors: - Matthew Roeschke (https://github.com/mroeschke) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/15964 --- python/cudf/cudf/core/indexed_frame.py | 34 +++++++++++-------- .../cudf/cudf/tests/series/test_conversion.py | 13 +++++++ 2 files 
changed, 32 insertions(+), 15 deletions(-) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 63fa96d0db0..30b68574960 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -6235,13 +6235,13 @@ def rank( def convert_dtypes( self, - infer_objects=True, - convert_string=True, - convert_integer=True, - convert_boolean=True, - convert_floating=True, + infer_objects: bool = True, + convert_string: bool = True, + convert_integer: bool = True, + convert_boolean: bool = True, + convert_floating: bool = True, dtype_backend=None, - ): + ) -> Self: """ Convert columns to the best possible nullable dtypes. @@ -6252,17 +6252,21 @@ def convert_dtypes( All other dtypes are always returned as-is as all dtypes in cudf are nullable. """ - result = self.copy() - - if convert_floating: - # cast any floating columns to int64 if - # they are all integer data: - for name, col in result._data.items(): + if not (convert_floating and convert_integer): + return self.copy() + else: + cols = [] + for col in self._columns: if col.dtype.kind == "f": col = col.fillna(0) - if cp.allclose(col, col.astype("int64")): - result._data[name] = col.astype("int64") - return result + as_int = col.astype("int64") + if cp.allclose(col, as_int): + cols.append(as_int) + continue + cols.append(col) + return self._from_data_like_self( + self._data._from_columns_like_self(cols, verify=False) + ) @_warn_no_dask_cudf def __dask_tokenize__(self): diff --git a/python/cudf/cudf/tests/series/test_conversion.py b/python/cudf/cudf/tests/series/test_conversion.py index e1dd359e1ba..1d680d7860d 100644 --- a/python/cudf/cudf/tests/series/test_conversion.py +++ b/python/cudf/cudf/tests/series/test_conversion.py @@ -31,5 +31,18 @@ def test_convert_dtypes(data, dtype): assert_eq(expect, got) +def test_convert_integer_false_convert_floating_true(): + data = [1.000000000000000000000000001, 1] + expected = pd.Series(data).convert_dtypes( 
+ convert_integer=False, convert_floating=True + ) + result = ( + cudf.Series(data) + .convert_dtypes(convert_integer=False, convert_floating=True) + .to_pandas(nullable=True) + ) + assert_eq(result, expected) + + # Now write the same test, but construct a DataFrame # as input instead of parametrizing: From dba46e7a8957b8389b69e820485e319a1d314017 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 15 Jul 2024 15:21:50 -1000 Subject: [PATCH 05/17] Replace is_datetime/timedelta_dtype checks with .kind checks (#16262) It appears this was called when we already had a dtype object so can instead just simply check the .kind attribute Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16262 --- python/cudf/cudf/_fuzz_testing/utils.py | 3 +- python/cudf/cudf/core/column/datetime.py | 7 +-- python/cudf/cudf/core/column/timedelta.py | 4 +- python/cudf/cudf/core/dataframe.py | 7 ++- python/cudf/cudf/core/scalar.py | 8 +--- python/cudf/cudf/core/tools/numeric.py | 9 +--- python/cudf/cudf/tests/test_binops.py | 7 +-- python/cudf/cudf/tests/test_dataframe.py | 4 +- python/cudf/cudf/tests/test_list.py | 7 +-- python/cudf/cudf/tests/test_scalar.py | 11 +---- python/cudf/cudf/utils/dtypes.py | 56 ++++++++--------------- 11 files changed, 37 insertions(+), 86 deletions(-) diff --git a/python/cudf/cudf/_fuzz_testing/utils.py b/python/cudf/cudf/_fuzz_testing/utils.py index e6dfe2eae62..8ce92e1c0f6 100644 --- a/python/cudf/cudf/_fuzz_testing/utils.py +++ b/python/cudf/cudf/_fuzz_testing/utils.py @@ -192,8 +192,7 @@ def convert_nulls_to_none(records, df): col for col in df.columns if df[col].dtype in pandas_dtypes_to_np_dtypes - or pd.api.types.is_datetime64_dtype(df[col].dtype) - or pd.api.types.is_timedelta64_dtype(df[col].dtype) + or df[col].dtype.kind in "mM" ] for record in records: diff --git 
a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 214e84028d2..409c44f6eee 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -18,7 +18,6 @@ from cudf import _lib as libcudf from cudf._lib.labeling import label_bins from cudf._lib.search import search_sorted -from cudf.api.types import is_datetime64_dtype, is_timedelta64_dtype from cudf.core._compat import PANDAS_GE_220 from cudf.core._internals.timezones import ( check_ambiguous_and_nonexistent, @@ -565,10 +564,8 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: # We check this on `other` before reflection since we already know the # dtype of `self`. - other_is_timedelta = is_timedelta64_dtype(other.dtype) - other_is_datetime64 = not other_is_timedelta and is_datetime64_dtype( - other.dtype - ) + other_is_timedelta = other.dtype.kind == "m" + other_is_datetime64 = other.dtype.kind == "M" lhs, rhs = (other, self) if reflect else (self, other) out_dtype = None diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 2cbed9212de..36d7d9f9614 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -12,7 +12,7 @@ import cudf from cudf import _lib as libcudf -from cudf.api.types import is_scalar, is_timedelta64_dtype +from cudf.api.types import is_scalar from cudf.core.buffer import Buffer, acquire_spill_lock from cudf.core.column import ColumnBase, column, string from cudf.utils.dtypes import np_to_pa_dtype @@ -153,7 +153,7 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: this: ColumnBinaryOperand = self out_dtype = None - if is_timedelta64_dtype(other.dtype): + if other.dtype.kind == "m": # TODO: pandas will allow these operators to work but return false # when comparing to non-timedelta dtypes. We should do the same. 
if op in { diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index f110b788789..2aa1b95e2d1 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -33,7 +33,6 @@ from cudf.api.types import ( _is_scalar_or_zero_d_array, is_bool_dtype, - is_datetime_dtype, is_dict_like, is_dtype_equal, is_list_like, @@ -6113,7 +6112,7 @@ def _prepare_for_rowwise_op(self, method, skipna, numeric_only): else: filtered = self.copy(deep=False) - is_pure_dt = all(is_datetime_dtype(dt) for dt in filtered.dtypes) + is_pure_dt = all(dt.kind == "M" for dt in filtered.dtypes) common_dtype = find_common_type(filtered.dtypes) if ( @@ -6510,7 +6509,7 @@ def _apply_cupy_method_axis_1(self, method, *args, **kwargs): cudf.utils.dtypes.get_min_float_dtype( prepared._data[col] ) - if not is_datetime_dtype(common_dtype) + if common_dtype.kind != "M" else cudf.dtype("float64") ) .fillna(np.nan) @@ -6537,7 +6536,7 @@ def _apply_cupy_method_axis_1(self, method, *args, **kwargs): result_dtype = ( common_dtype if method in type_coerced_methods - or is_datetime_dtype(common_dtype) + or (common_dtype is not None and common_dtype.kind == "M") else None ) result = column.as_column(result, dtype=result_dtype) diff --git a/python/cudf/cudf/core/scalar.py b/python/cudf/cudf/core/scalar.py index 29460d8c67e..f6331aa1f49 100644 --- a/python/cudf/cudf/core/scalar.py +++ b/python/cudf/cudf/core/scalar.py @@ -8,7 +8,7 @@ import pyarrow as pa import cudf -from cudf.api.types import is_datetime64_dtype, is_scalar, is_timedelta64_dtype +from cudf.api.types import is_scalar from cudf.core.dtypes import ListDtype, StructDtype from cudf.core.missing import NA, NaT from cudf.core.mixins import BinaryOperand @@ -245,11 +245,7 @@ def _preprocess_host_value(self, value, dtype): dtype = cudf.dtype(dtype) if not valid: - value = ( - NaT - if is_datetime64_dtype(dtype) or is_timedelta64_dtype(dtype) - else NA - ) + value = NaT if dtype.kind in "mM" else NA 
return value, dtype diff --git a/python/cudf/cudf/core/tools/numeric.py b/python/cudf/cudf/core/tools/numeric.py index ef6b86a04a7..466d46f7dca 100644 --- a/python/cudf/cudf/core/tools/numeric.py +++ b/python/cudf/cudf/core/tools/numeric.py @@ -8,12 +8,7 @@ import cudf from cudf import _lib as libcudf from cudf._lib import strings as libstrings -from cudf.api.types import ( - _is_non_decimal_numeric_dtype, - is_datetime_dtype, - is_string_dtype, - is_timedelta_dtype, -) +from cudf.api.types import _is_non_decimal_numeric_dtype, is_string_dtype from cudf.core.column import as_column from cudf.core.dtypes import CategoricalDtype from cudf.utils.dtypes import can_convert_to_column @@ -114,7 +109,7 @@ def to_numeric(arg, errors="raise", downcast=None): col = as_column(arg) dtype = col.dtype - if is_datetime_dtype(dtype) or is_timedelta_dtype(dtype): + if dtype.kind in "mM": col = col.astype(cudf.dtype("int64")) elif isinstance(dtype, CategoricalDtype): cat_dtype = col.dtype.type diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py index 5265278db4c..503b1a975b4 100644 --- a/python/cudf/cudf/tests/test_binops.py +++ b/python/cudf/cudf/tests/test_binops.py @@ -1694,12 +1694,7 @@ def test_scalar_null_binops(op, dtype_l, dtype_r): rhs = cudf.Scalar(cudf.NA, dtype=dtype_r) result = op(lhs, rhs) - assert result.value is ( - cudf.NaT - if cudf.api.types.is_datetime64_dtype(result.dtype) - or cudf.api.types.is_timedelta64_dtype(result.dtype) - else cudf.NA - ) + assert result.value is (cudf.NaT if result.dtype.kind in "mM" else cudf.NA) # make sure dtype is the same as had there been a valid scalar valid_lhs = cudf.Scalar(1, dtype=dtype_l) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index f40106a30f4..7ccf83e424c 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -5457,9 +5457,7 @@ def test_rowwise_ops_datetime_dtypes(data, op, skipna, 
numeric_only): gdf = cudf.DataFrame(data) pdf = gdf.to_pandas() - if not numeric_only and not all( - cudf.api.types.is_datetime64_dtype(dt) for dt in gdf.dtypes - ): + if not numeric_only and not all(dt.kind == "M" for dt in gdf.dtypes): with pytest.raises(TypeError): got = getattr(gdf, op)( axis=1, skipna=skipna, numeric_only=numeric_only diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py index ec9d7995b05..36bcaa66d7d 100644 --- a/python/cudf/cudf/tests/test_list.py +++ b/python/cudf/cudf/tests/test_list.py @@ -694,12 +694,7 @@ def test_list_scalar_host_construction_null(elem_type, nesting_level): dtype = cudf.ListDtype(dtype) slr = cudf.Scalar(None, dtype=dtype) - assert slr.value is ( - cudf.NaT - if cudf.api.types.is_datetime64_dtype(slr.dtype) - or cudf.api.types.is_timedelta64_dtype(slr.dtype) - else cudf.NA - ) + assert slr.value is (cudf.NaT if slr.dtype.kind in "mM" else cudf.NA) @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_scalar.py b/python/cudf/cudf/tests/test_scalar.py index 195231e9960..f2faf4343b6 100644 --- a/python/cudf/cudf/tests/test_scalar.py +++ b/python/cudf/cudf/tests/test_scalar.py @@ -212,9 +212,7 @@ def test_scalar_roundtrip(value): ) def test_null_scalar(dtype): s = cudf.Scalar(None, dtype=dtype) - if cudf.api.types.is_datetime64_dtype( - dtype - ) or cudf.api.types.is_timedelta64_dtype(dtype): + if s.dtype.kind in "mM": assert s.value is cudf.NaT else: assert s.value is cudf.NA @@ -369,12 +367,7 @@ def test_scalar_implicit_int_conversion(value): @pytest.mark.parametrize("dtype", sorted(set(ALL_TYPES) - {"category"})) def test_scalar_invalid_implicit_conversion(cls, dtype): try: - cls( - pd.NaT - if cudf.api.types.is_datetime64_dtype(dtype) - or cudf.api.types.is_timedelta64_dtype(dtype) - else pd.NA - ) + cls(pd.NaT if cudf.dtype(dtype).kind in "mM" else pd.NA) except TypeError as e: with pytest.raises(TypeError, match=re.escape(str(e))): slr = cudf.Scalar(None, dtype=dtype) diff 
--git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 0dec857ea96..59e5ec1df04 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -424,9 +424,7 @@ def get_time_unit(obj): def _get_nan_for_dtype(dtype): dtype = cudf.dtype(dtype) - if pd.api.types.is_datetime64_dtype( - dtype - ) or pd.api.types.is_timedelta64_dtype(dtype): + if dtype.kind in "mM": time_unit, _ = np.datetime_data(dtype) return dtype.type("nat", time_unit) elif dtype.kind == "f": @@ -527,16 +525,14 @@ def find_common_type(dtypes): return cudf.dtype("O") # Aggregate same types - dtypes = set(dtypes) + dtypes = {cudf.dtype(dtype) for dtype in dtypes} + if len(dtypes) == 1: + return dtypes.pop() if any( isinstance(dtype, cudf.core.dtypes.DecimalDtype) for dtype in dtypes ): - if all( - cudf.api.types.is_decimal_dtype(dtype) - or cudf.api.types.is_numeric_dtype(dtype) - for dtype in dtypes - ): + if all(cudf.api.types.is_numeric_dtype(dtype) for dtype in dtypes): return _find_common_type_decimal( [ dtype @@ -546,40 +542,28 @@ def find_common_type(dtypes): ) else: return cudf.dtype("O") - if any(isinstance(dtype, cudf.ListDtype) for dtype in dtypes): - if len(dtypes) == 1: - return dtypes.get(0) - else: - # TODO: As list dtypes allow casting - # to identical types, improve this logic of returning a - # common dtype, for example: - # ListDtype(int64) & ListDtype(int32) common - # dtype could be ListDtype(int64). 
- raise NotImplementedError( - "Finding a common type for `ListDtype` is currently " - "not supported" - ) - if any(isinstance(dtype, cudf.StructDtype) for dtype in dtypes): - if len(dtypes) == 1: - return dtypes.get(0) - else: - raise NotImplementedError( - "Finding a common type for `StructDtype` is currently " - "not supported" - ) + elif any( + isinstance(dtype, (cudf.ListDtype, cudf.StructDtype)) + for dtype in dtypes + ): + # TODO: As list dtypes allow casting + # to identical types, improve this logic of returning a + # common dtype, for example: + # ListDtype(int64) & ListDtype(int32) common + # dtype could be ListDtype(int64). + raise NotImplementedError( + "Finding a common type for `ListDtype` or `StructDtype` is currently " + "not supported" + ) # Corner case 1: # Resort to np.result_type to handle "M" and "m" types separately - dt_dtypes = set( - filter(lambda t: cudf.api.types.is_datetime_dtype(t), dtypes) - ) + dt_dtypes = set(filter(lambda t: t.kind == "M", dtypes)) if len(dt_dtypes) > 0: dtypes = dtypes - dt_dtypes dtypes.add(np.result_type(*dt_dtypes)) - td_dtypes = set( - filter(lambda t: pd.api.types.is_timedelta64_dtype(t), dtypes) - ) + td_dtypes = set(filter(lambda t: t.kind == "m", dtypes)) if len(td_dtypes) > 0: dtypes = dtypes - td_dtypes dtypes.add(np.result_type(*td_dtypes)) From 47a0a87db454cc767ab5f74beb2198a480d6f2c0 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 15 Jul 2024 16:13:29 -1000 Subject: [PATCH 06/17] Type & reduce cupy usage (#16277) There are some cupy usages that don't seem _strictly_ necessary (generating starting data, array type conversion) in some APIs. 
IMO we should prefer using CPU data/the existing data structure/Column ops over cupy when possible closes https://github.com/rapidsai/cudf/issues/12133 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/16277 --- python/cudf/cudf/core/_base_index.py | 4 ++- python/cudf/cudf/core/column/column.py | 8 +++--- python/cudf/cudf/core/column/datetime.py | 6 ++-- python/cudf/cudf/core/column/numerical.py | 10 ++----- python/cudf/cudf/core/cut.py | 6 ++-- python/cudf/cudf/core/dataframe.py | 18 +++++++----- python/cudf/cudf/core/frame.py | 6 ++-- python/cudf/cudf/core/groupby/groupby.py | 23 ++++++++------- python/cudf/cudf/core/index.py | 34 ++++++++++++----------- python/cudf/cudf/core/multiindex.py | 13 +++++---- python/cudf/cudf/core/tools/datetimes.py | 9 +++--- python/cudf/cudf/tests/test_datetime.py | 15 ++-------- 12 files changed, 74 insertions(+), 78 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index e160fa697ee..9ba2d161619 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -38,6 +38,8 @@ if TYPE_CHECKING: from collections.abc import Generator + import cupy + from cudf.core.column_accessor import ColumnAccessor @@ -2001,7 +2003,7 @@ def drop_duplicates( self._column_names, ) - def duplicated(self, keep="first"): + def duplicated(self, keep="first") -> cupy.ndarray: """ Indicate duplicate index values. 
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index f633d527681..fd3664ecac4 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -721,7 +721,7 @@ def notnull(self) -> ColumnBase: return result def indices_of( - self, value: ScalarLike | Self + self, value: ScalarLike ) -> cudf.core.column.NumericalColumn: """ Find locations of value in the column @@ -735,10 +735,10 @@ def indices_of( ------- Column of indices that match value """ - if not isinstance(value, ColumnBase): - value = as_column([value], dtype=self.dtype) + if not is_scalar(value): + raise ValueError("value must be a scalar") else: - assert len(value) == 1 + value = as_column(value, dtype=self.dtype, length=1) mask = libcudf.search.contains(value, self) return apply_boolean_mask( [as_column(range(0, len(self)), dtype=size_type_dtype)], mask diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 409c44f6eee..004a059af95 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -629,9 +629,9 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: def indices_of( self, value: ScalarLike ) -> cudf.core.column.NumericalColumn: - value = column.as_column( - pd.to_datetime(value), dtype=self.dtype - ).astype("int64") + value = ( + pd.to_datetime(value).to_numpy().astype(self.dtype).astype("int64") + ) return self.astype("int64").indices_of(value) @property diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index b8fa00e9643..7f05a5f91a1 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -5,7 +5,6 @@ import functools from typing import TYPE_CHECKING, Any, Callable, Sequence, cast -import cupy as cp import numpy as np import pandas as pd from typing_extensions import Self @@ -13,7 +12,6 @@ import cudf from 
cudf import _lib as libcudf from cudf._lib import pylibcudf -from cudf._lib.types import size_type_dtype from cudf.api.types import ( is_bool_dtype, is_float_dtype, @@ -131,12 +129,8 @@ def indices_of(self, value: ScalarLike) -> NumericalColumn: and self.dtype.kind in {"c", "f"} and np.isnan(value) ): - return column.as_column( - cp.argwhere( - cp.isnan(self.data_array_view(mode="read")) - ).flatten(), - dtype=size_type_dtype, - ) + nan_col = libcudf.unary.is_nan(self) + return nan_col.indices_of(True) else: return super().indices_of(value) diff --git a/python/cudf/cudf/core/cut.py b/python/cudf/cudf/core/cut.py index d9f62f51f92..197f46ee9fe 100644 --- a/python/cudf/cudf/core/cut.py +++ b/python/cudf/cudf/core/cut.py @@ -188,9 +188,6 @@ def cut( # adjust bin edges decimal precision int_label_bins = np.around(bins, precision) - # the inputs is a column of the values in the array x - input_arr = as_column(x) - # checking for the correct inclusivity values if right: closed = "right" @@ -242,6 +239,9 @@ def cut( labels if len(set(labels)) == len(labels) else None ) + # the inputs is a column of the values in the array x + input_arr = as_column(x) + if isinstance(bins, pd.IntervalIndex): # get the left and right edges of the bins as columns # we cannot typecast an IntervalIndex, so we need to diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 2aa1b95e2d1..2121e623c1c 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -429,7 +429,7 @@ def _setitem_tuple_arg(self, key, value): else: value = cupy.asarray(value) - if cupy.ndim(value) == 2: + if value.ndim == 2: # If the inner dimension is 1, it's broadcastable to # all columns of the dataframe. 
indexed_shape = columns_df.loc[key[0]].shape @@ -566,7 +566,7 @@ def _setitem_tuple_arg(self, key, value): # TODO: consolidate code path with identical counterpart # in `_DataFrameLocIndexer._setitem_tuple_arg` value = cupy.asarray(value) - if cupy.ndim(value) == 2: + if value.ndim == 2: indexed_shape = columns_df.iloc[key[0]].shape if value.shape[1] == 1: if value.shape[0] != indexed_shape[0]: @@ -2199,8 +2199,8 @@ def from_dict( orient = orient.lower() if orient == "index": - if len(data) > 0 and isinstance( - next(iter(data.values())), (cudf.Series, cupy.ndarray) + if isinstance( + next(iter(data.values()), None), (cudf.Series, cupy.ndarray) ): result = cls(data).T result.columns = ( @@ -5698,7 +5698,13 @@ def from_records(cls, data, index=None, columns=None, nan_as_null=False): @classmethod @_performance_tracking - def _from_arrays(cls, data, index=None, columns=None, nan_as_null=False): + def _from_arrays( + cls, + data: np.ndarray | cupy.ndarray, + index=None, + columns=None, + nan_as_null=False, + ): """Convert a numpy/cupy array to DataFrame. 
Parameters @@ -5716,8 +5722,6 @@ def _from_arrays(cls, data, index=None, columns=None, nan_as_null=False): ------- DataFrame """ - - data = cupy.asarray(data) if data.ndim != 1 and data.ndim != 2: raise ValueError( f"records dimension expected 1 or 2 but found: {data.ndim}" diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 253d200f7d4..802751e47ad 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1189,7 +1189,7 @@ def searchsorted( side: Literal["left", "right"] = "left", ascending: bool = True, na_position: Literal["first", "last"] = "last", - ): + ) -> ScalarLike | cupy.ndarray: """Find indices where elements should be inserted to maintain order Parameters @@ -1527,7 +1527,7 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): @acquire_spill_lock() def _apply_cupy_ufunc_to_operands( self, ufunc, cupy_func, operands, **kwargs - ): + ) -> list[dict[Any, ColumnBase]]: # Note: There are some operations that may be supported by libcudf but # are not supported by pandas APIs. In particular, libcudf binary # operations support logical and/or operations as well as @@ -1538,7 +1538,7 @@ def _apply_cupy_ufunc_to_operands( # without cupy. 
mask = None - data = [{} for _ in range(ufunc.nout)] + data: list[dict[Any, ColumnBase]] = [{} for _ in range(ufunc.nout)] for name, (left, right, _, _) in operands.items(): cupy_inputs = [] for inp in (left, right) if ufunc.nin == 2 else (left,): diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index eccb3acabf6..8659d7c2392 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -35,7 +35,12 @@ from cudf.utils.utils import GetAttrGetItemMixin if TYPE_CHECKING: - from cudf._typing import AggType, DataFrameOrSeries, MultiColumnAggType + from cudf._typing import ( + AggType, + DataFrameOrSeries, + MultiColumnAggType, + ScalarLike, + ) def _deprecate_collect(): @@ -357,7 +362,7 @@ def groups(self): ) @cached_property - def indices(self): + def indices(self) -> dict[ScalarLike, cp.ndarray]: """ Dict {group name -> group indices}. @@ -1015,18 +1020,16 @@ def ngroup(self, ascending=True): if ascending: # Count ascending from 0 to num_groups - 1 - group_ids = cudf.Series._from_data({None: cp.arange(num_groups)}) + groups = range(num_groups) elif has_null_group: # Count descending from num_groups - 1 to 0, but subtract one more # for the null group making it num_groups - 2 to -1. 
- group_ids = cudf.Series._from_data( - {None: cp.arange(num_groups - 2, -2, -1)} - ) + groups = range(num_groups - 2, -2, -1) else: # Count descending from num_groups - 1 to 0 - group_ids = cudf.Series._from_data( - {None: cp.arange(num_groups - 1, -1, -1)} - ) + groups = range(num_groups - 1, -1, -1) + + group_ids = cudf.Series._from_data({None: as_column(groups)}) if has_null_group: group_ids.iloc[-1] = cudf.NA @@ -1713,7 +1716,7 @@ def rolling_avg(val, avg): return grouped_values.apply_chunks(function, **kwargs) @_performance_tracking - def _broadcast(self, values): + def _broadcast(self, values: cudf.Series) -> cudf.Series: """ Broadcast the results of an aggregation to the group diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index b398ee2343e..4164f981fca 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -103,7 +103,7 @@ def __subclasscheck__(self, subclass): def _lexsorted_equal_range( idx: Index | cudf.MultiIndex, - key_as_table: Frame, + keys: list[ColumnBase], is_sorted: bool, ) -> tuple[int, int, ColumnBase | None]: """Get equal range for key in lexicographically sorted index. 
If index @@ -118,13 +118,13 @@ def _lexsorted_equal_range( sort_vals = idx lower_bound = search_sorted( [*sort_vals._data.columns], - [*key_as_table._columns], + keys, side="left", ascending=sort_vals.is_monotonic_increasing, ).element_indexing(0) upper_bound = search_sorted( [*sort_vals._data.columns], - [*key_as_table._columns], + keys, side="right", ascending=sort_vals.is_monotonic_increasing, ).element_indexing(0) @@ -260,7 +260,9 @@ def searchsorted( ), "Invalid ascending flag" return search_range(value, self._range, side=side) - def factorize(self, sort: bool = False, use_na_sentinel: bool = True): + def factorize( + self, sort: bool = False, use_na_sentinel: bool = True + ) -> tuple[cupy.ndarray, Self]: if sort and self.step < 0: codes = cupy.arange(len(self) - 1, -1, -1) uniques = self[::-1] @@ -753,15 +755,16 @@ def difference(self, other, sort=None): super().difference(other, sort=sort) ) - def _try_reconstruct_range_index(self, index): - if isinstance(index, RangeIndex) or index.dtype.kind == "f": + def _try_reconstruct_range_index( + self, index: BaseIndex + ) -> Self | BaseIndex: + if isinstance(index, RangeIndex) or index.dtype.kind not in "iu": return index # Evenly spaced values can return a # RangeIndex instead of a materialized Index. 
- if not index._column.has_nulls(): + if not index._column.has_nulls(): # type: ignore[attr-defined] uniques = cupy.unique(cupy.diff(index.values)) - if len(uniques) == 1 and uniques[0].get() != 0: - diff = uniques[0].get() + if len(uniques) == 1 and (diff := uniques[0].get()) != 0: new_range = range(index[0], index[-1] + diff, diff) return type(self)(new_range, name=index.name) return index @@ -1309,7 +1312,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): return _return_get_indexer_result(result_series.to_cupy()) @_performance_tracking - def get_loc(self, key): + def get_loc(self, key) -> int | slice | cupy.ndarray: if not is_scalar(key): raise TypeError("Should be a scalar-like") @@ -1317,9 +1320,8 @@ def get_loc(self, key): self.is_monotonic_increasing or self.is_monotonic_decreasing ) - target_as_table = cudf.core.frame.Frame({"None": as_column([key])}) lower_bound, upper_bound, sort_inds = _lexsorted_equal_range( - self, target_as_table, is_sorted + self, [as_column([key])], is_sorted ) if lower_bound == upper_bound: @@ -1330,7 +1332,7 @@ def get_loc(self, key): return ( lower_bound if is_sorted - else sort_inds.element_indexing(lower_bound) + else sort_inds.element_indexing(lower_bound) # type: ignore[union-attr] ) if is_sorted: @@ -1339,8 +1341,8 @@ def get_loc(self, key): return slice(lower_bound, upper_bound) # Not sorted and not unique. Return a boolean mask - mask = cupy.full(self._data.nrows, False) - true_inds = sort_inds.slice(lower_bound, upper_bound).values + mask = cupy.full(len(self), False) + true_inds = sort_inds.slice(lower_bound, upper_bound).values # type: ignore[union-attr] mask[true_inds] = True return mask @@ -2076,7 +2078,7 @@ def day_of_year(self): @property # type: ignore @_performance_tracking - def is_leap_year(self): + def is_leap_year(self) -> cupy.ndarray: """ Boolean indicator if the date belongs to a leap year. 
diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 6503dae6ff5..3ed72ff812a 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -1926,17 +1926,18 @@ def get_loc(self, key): # Handle partial key search. If length of `key` is less than `nlevels`, # Only search levels up to `len(key)` level. - key_as_table = cudf.core.frame.Frame( - {i: column.as_column(k, length=1) for i, k in enumerate(key)} - ) partial_index = self.__class__._from_data( - data=self._data.select_by_index(slice(key_as_table._num_columns)) + data=self._data.select_by_index(slice(len(key))) ) ( lower_bound, upper_bound, sort_inds, - ) = _lexsorted_equal_range(partial_index, key_as_table, is_sorted) + ) = _lexsorted_equal_range( + partial_index, + [column.as_column(k, length=1) for k in key], + is_sorted, + ) if lower_bound == upper_bound: raise KeyError(key) @@ -1961,7 +1962,7 @@ def get_loc(self, key): return true_inds # Not sorted and not unique. Return a boolean mask - mask = cp.full(self._data.nrows, False) + mask = cp.full(len(self), False) mask[true_inds] = True return mask diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index 064e8fc667d..c6e2b5d10e1 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -6,7 +6,6 @@ import warnings from typing import Literal, Sequence -import cupy as cp import numpy as np import pandas as pd import pandas.tseries.offsets as pd_offset @@ -894,7 +893,7 @@ def date_range( # integers and divide the number range evenly with `periods` elements. 
start = cudf.Scalar(start, dtype=dtype).value.astype("int64") end = cudf.Scalar(end, dtype=dtype).value.astype("int64") - arr = cp.linspace(start=start, stop=end, num=periods) + arr = np.linspace(start=start, stop=end, num=periods) result = cudf.core.column.as_column(arr).astype("datetime64[ns]") return cudf.DatetimeIndex._from_data({name: result}).tz_localize(tz) @@ -991,8 +990,10 @@ def date_range( stop = end_estim.astype("int64") start = start.value.astype("int64") step = _offset_to_nanoseconds_lower_bound(offset) - arr = cp.arange(start=start, stop=stop, step=step, dtype="int64") - res = cudf.core.column.as_column(arr).astype("datetime64[ns]") + arr = range(int(start), int(stop), step) + res = cudf.core.column.as_column(arr, dtype="int64").astype( + "datetime64[ns]" + ) return cudf.DatetimeIndex._from_data({name: res}, freq=freq).tz_localize( tz diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index 092e9790c63..7ab9ff2ef23 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -1534,18 +1534,7 @@ def test_date_range_start_end_periods(start, end, periods): ) -def test_date_range_start_end_freq(request, start, end, freq): - request.applymarker( - pytest.mark.xfail( - condition=( - start == "1831-05-08 15:23:21" - and end == "1996-11-21 04:05:30" - and freq == "110546789ms" - ), - reason="https://github.com/rapidsai/cudf/issues/12133", - ) - ) - +def test_date_range_start_end_freq(start, end, freq): if isinstance(freq, str): _gfreq = _pfreq = freq else: @@ -1561,7 +1550,7 @@ def test_date_range_start_end_freq(request, start, end, freq): ) -def test_date_range_start_freq_periods(request, start, freq, periods): +def test_date_range_start_freq_periods(start, freq, periods): if isinstance(freq, str): _gfreq = _pfreq = freq else: From beda22ed28030bbed2faaa5a49509255f11976aa Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: 
Mon, 15 Jul 2024 16:29:05 -1000 Subject: [PATCH 07/17] Replace is_bool_type with checking .dtype.kind (#16255) It appears this was called when we already had a dtype object so can instead just simply check the .kind attribute Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16255 --- python/cudf/cudf/core/_base_index.py | 9 +++------ python/cudf/cudf/core/_internals/where.py | 8 ++------ python/cudf/cudf/core/column/column.py | 7 +++---- python/cudf/cudf/core/column/numerical.py | 5 ++--- python/cudf/cudf/core/dataframe.py | 13 ++++++------- python/cudf/cudf/core/groupby/groupby.py | 4 ++-- python/cudf/cudf/core/indexing_utils.py | 3 +-- python/cudf/cudf/core/multiindex.py | 4 ---- python/cudf/cudf/core/series.py | 11 +++++------ python/cudf/cudf/core/single_column_frame.py | 3 +-- python/cudf/cudf/tests/test_dataframe.py | 2 +- python/cudf/cudf/tests/test_index.py | 5 ++--- 12 files changed, 28 insertions(+), 46 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 9ba2d161619..479f87bb78b 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -20,7 +20,6 @@ from cudf._lib.types import size_type_dtype from cudf.api.extensions import no_default from cudf.api.types import ( - is_bool_dtype, is_integer, is_integer_dtype, is_list_like, @@ -610,10 +609,8 @@ def union(self, other, sort=None): ) if cudf.get_option("mode.pandas_compatible"): - if ( - is_bool_dtype(self.dtype) and not is_bool_dtype(other.dtype) - ) or ( - not is_bool_dtype(self.dtype) and is_bool_dtype(other.dtype) + if (self.dtype.kind == "b" and other.dtype.kind != "b") or ( + self.dtype.kind != "b" and other.dtype.kind == "b" ): # Bools + other types will result in mixed type. # This is not yet consistent in pandas and specific to APIs. 
@@ -2154,7 +2151,7 @@ def _apply_boolean_mask(self, boolean_mask): Rows corresponding to `False` is dropped. """ boolean_mask = cudf.core.column.as_column(boolean_mask) - if not is_bool_dtype(boolean_mask.dtype): + if boolean_mask.dtype.kind != "b": raise ValueError("boolean_mask is not boolean type.") return self._from_columns_like_self( diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py index f3183e6029d..4a36be76b6d 100644 --- a/python/cudf/cudf/core/_internals/where.py +++ b/python/cudf/cudf/core/_internals/where.py @@ -7,11 +7,7 @@ import numpy as np import cudf -from cudf.api.types import ( - _is_non_decimal_numeric_dtype, - is_bool_dtype, - is_scalar, -) +from cudf.api.types import _is_non_decimal_numeric_dtype, is_scalar from cudf.core.dtypes import CategoricalDtype from cudf.utils.dtypes import ( _can_cast, @@ -112,7 +108,7 @@ def _check_and_cast_columns_with_other( other = cudf.Scalar(other) if is_mixed_with_object_dtype(other, source_col) or ( - is_bool_dtype(source_dtype) and not is_bool_dtype(common_dtype) + source_dtype.kind == "b" and common_dtype.kind != "b" ): raise TypeError(mixed_err) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index fd3664ecac4..dbdf501e022 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -41,7 +41,6 @@ _is_non_decimal_numeric_dtype, _is_pandas_nullable_extension_dtype, infer_dtype, - is_bool_dtype, is_dtype_equal, is_scalar, is_string_dtype, @@ -619,7 +618,7 @@ def _scatter_by_column( key: cudf.core.column.NumericalColumn, value: cudf.core.scalar.Scalar | ColumnBase, ) -> Self: - if is_bool_dtype(key.dtype): + if key.dtype.kind == "b": # `key` is boolean mask if len(key) != len(self): raise ValueError( @@ -644,7 +643,7 @@ def _scatter_by_column( self._check_scatter_key_length(num_keys, value) - if is_bool_dtype(key.dtype): + if key.dtype.kind == "b": return 
libcudf.copying.boolean_mask_scatter([value], [self], key)[ 0 ]._with_type_metadata(self.dtype) @@ -1083,7 +1082,7 @@ def as_decimal_column( def apply_boolean_mask(self, mask) -> ColumnBase: mask = as_column(mask) - if not is_bool_dtype(mask.dtype): + if mask.dtype.kind != "b": raise ValueError("boolean_mask is not boolean type.") return apply_boolean_mask([self], mask)[0]._with_type_metadata( diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 7f05a5f91a1..cea68c88c90 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -13,7 +13,6 @@ from cudf import _lib as libcudf from cudf._lib import pylibcudf from cudf.api.types import ( - is_bool_dtype, is_float_dtype, is_integer, is_integer_dtype, @@ -159,7 +158,7 @@ def __setitem__(self, key: Any, value: Any): else as_column(value) ) - if not is_bool_dtype(self.dtype) and is_bool_dtype(device_value.dtype): + if self.dtype.kind != "b" and device_value.dtype.kind == "b": raise TypeError(f"Invalid value {value} for dtype {self.dtype}") else: device_value = device_value.astype(self.dtype) @@ -264,7 +263,7 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: f"{self.dtype.type.__name__} and " f"{other.dtype.type.__name__}" ) - if is_bool_dtype(self.dtype) or is_bool_dtype(other.dtype): + if self.dtype.kind == "b" or other.dtype.kind == "b": out_dtype = "bool" if ( diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 2121e623c1c..b3d938829c9 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -32,7 +32,6 @@ from cudf.api.extensions import no_default from cudf.api.types import ( _is_scalar_or_zero_d_array, - is_bool_dtype, is_dict_like, is_dtype_equal, is_list_like, @@ -171,7 +170,7 @@ def _can_downcast_to_series(self, df, arg): ): return False else: - if is_bool_dtype(as_column(arg[0]).dtype) and not isinstance( + if 
as_column(arg[0]).dtype.kind == "b" and not isinstance( arg[1], slice ): return True @@ -320,7 +319,7 @@ def _getitem_tuple_arg(self, arg): tmp_arg[1], ) - if is_bool_dtype(tmp_arg[0].dtype): + if tmp_arg[0].dtype.kind == "b": df = columns_df._apply_boolean_mask( BooleanMask(tmp_arg[0], len(columns_df)) ) @@ -3678,8 +3677,8 @@ def agg(self, aggs, axis=None): """ dtypes = [self[col].dtype for col in self._column_names] common_dtype = find_common_type(dtypes) - if not is_bool_dtype(common_dtype) and any( - is_bool_dtype(dtype) for dtype in dtypes + if common_dtype.kind != "b" and any( + dtype.kind == "b" for dtype in dtypes ): raise MixedTypeError("Cannot create a column with mixed types") @@ -6305,8 +6304,8 @@ def _reduce( and any( not is_object_dtype(dtype) for dtype in source_dtypes ) - or not is_bool_dtype(common_dtype) - and any(is_bool_dtype(dtype) for dtype in source_dtypes) + or common_dtype.kind != "b" + and any(dtype.kind == "b" for dtype in source_dtypes) ): raise TypeError( "Columns must all have the same dtype to " diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 8659d7c2392..d2c75715be2 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -22,7 +22,7 @@ from cudf._lib.sort import segmented_sort_by_key from cudf._lib.types import size_type_dtype from cudf.api.extensions import no_default -from cudf.api.types import is_bool_dtype, is_list_like, is_numeric_dtype +from cudf.api.types import is_list_like, is_numeric_dtype from cudf.core._compat import PANDAS_LT_300 from cudf.core.abc import Serializable from cudf.core.column.column import ColumnBase, StructDtype, as_column @@ -1534,7 +1534,7 @@ def mult(df): # For `sum` & `product`, boolean types # will need to result in `int64` type. 
for name, col in res._data.items(): - if is_bool_dtype(col.dtype): + if col.dtype.kind == "b": res._data[name] = col.astype("int") return res diff --git a/python/cudf/cudf/core/indexing_utils.py b/python/cudf/cudf/core/indexing_utils.py index a5fed02cbed..9c81b0eb607 100644 --- a/python/cudf/cudf/core/indexing_utils.py +++ b/python/cudf/cudf/core/indexing_utils.py @@ -10,7 +10,6 @@ import cudf from cudf.api.types import ( _is_scalar_or_zero_d_array, - is_bool_dtype, is_integer, is_integer_dtype, ) @@ -230,7 +229,7 @@ def parse_row_iloc_indexer(key: Any, n: int) -> IndexingSpec: key = cudf.core.column.as_column(key) if isinstance(key, cudf.core.column.CategoricalColumn): key = key.astype(key.codes.dtype) - if is_bool_dtype(key.dtype): + if key.dtype.kind == "b": return MaskIndexer(BooleanMask(key, n)) elif len(key) == 0: return EmptyIndexer() diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 3ed72ff812a..ff4b06c6334 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -841,10 +841,6 @@ def _get_row_major( | tuple[Any, ...] 
| list[tuple[Any, ...]], ) -> DataFrameOrSeries: - if pd.api.types.is_bool_dtype( - list(row_tuple) if isinstance(row_tuple, tuple) else row_tuple - ): - return df[row_tuple] if isinstance(row_tuple, slice): if row_tuple.start is None: row_tuple = slice(self[0], row_tuple.stop, row_tuple.step) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 8c8fa75918c..e12cc3d52fb 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -22,7 +22,6 @@ from cudf.api.types import ( _is_non_decimal_numeric_dtype, _is_scalar_or_zero_d_array, - is_bool_dtype, is_dict_like, is_integer, is_integer_dtype, @@ -221,10 +220,10 @@ def __setitem__(self, key, value): f"Cannot assign {value=} to " f"non-float dtype={self._frame.dtype}" ) - elif ( - self._frame.dtype.kind == "b" - and not is_bool_dtype(value) - and value not in {None, cudf.NA} + elif self._frame.dtype.kind == "b" and not ( + value in {None, cudf.NA} + or isinstance(value, (np.bool_, bool)) + or (isinstance(value, cudf.Scalar) and value.dtype.kind == "b") ): raise MixedTypeError( f"Cannot assign {value=} to " @@ -3221,7 +3220,7 @@ def describe( percentiles = np.array([0.25, 0.5, 0.75]) dtype = "str" - if is_bool_dtype(self.dtype): + if self.dtype.kind == "b": data = _describe_categorical(self, percentiles) elif isinstance(self._column, cudf.core.column.NumericalColumn): data = _describe_numeric(self, percentiles) diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index f9555aee6a2..04c7db7a53c 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -11,7 +11,6 @@ from cudf.api.extensions import no_default from cudf.api.types import ( _is_scalar_or_zero_d_array, - is_bool_dtype, is_integer, is_integer_dtype, is_numeric_dtype, @@ -361,7 +360,7 @@ def _get_elements_from_column(self, arg) -> ScalarLike | ColumnBase: arg = cudf.core.column.column_empty(0, 
dtype="int32") if is_integer_dtype(arg.dtype): return self._column.take(arg) - if is_bool_dtype(arg.dtype): + if arg.dtype.kind == "b": if (bn := len(arg)) != (n := len(self)): raise IndexError( f"Boolean mask has wrong length: {bn} not {n}" diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 7ccf83e424c..2009fc49ce5 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -5234,7 +5234,7 @@ def test_rowwise_ops(data, op, skipna, numeric_only): else (pdf[column].notna().count() == 0) ) or cudf.api.types.is_numeric_dtype(pdf[column].dtype) - or cudf.api.types.is_bool_dtype(pdf[column].dtype) + or pdf[column].dtype.kind == "b" for column in pdf ): with pytest.raises(TypeError): diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 05dcd85df6a..9eba6122d26 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -16,7 +16,6 @@ import cudf from cudf.api.extensions import no_default -from cudf.api.types import is_bool_dtype from cudf.core.index import CategoricalIndex, DatetimeIndex, Index, RangeIndex from cudf.testing import assert_eq from cudf.testing._utils import ( @@ -2397,8 +2396,8 @@ def test_intersection_index(idx1, idx2, sort, pandas_compatible): expected, actual, exact=False - if (is_bool_dtype(idx1.dtype) and not is_bool_dtype(idx2.dtype)) - or (not is_bool_dtype(idx1.dtype) or is_bool_dtype(idx2.dtype)) + if (idx1.dtype.kind == "b" and idx2.dtype.kind != "b") + or (idx1.dtype.kind != "b" or idx2.dtype.kind == "b") else True, ) From 669db3ea4a0c24a343c5619dd00904ad22ea215b Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Tue, 16 Jul 2024 14:24:58 +0100 Subject: [PATCH 08/17] Fix logic in to_arrow for empty list column (#16279) An empty list column need not have empty children, it just needs to have zero length. 
In this case, the offsets array will have zero length, and we need to create a temporary buffer. Now that this branch runs, fix two errors in the construction of the arrow array: 1. The element type, if there are children, should be taken from the child array; 2. If the child arrays are empty, we must make an empty null array, rather than passing a null pointer as the values array, otherwise we hit a segfault inside arrow. The previous fix in #16201 correctly handled the empty children case (except for point two), but not the first case, which we do here. Since we weren't previously going down this code path (child_arrays was never empty), we never hit the latent segfault from point two. Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - David Wendt (https://github.com/davidwendt) - MithunR (https://github.com/mythrocks) URL: https://github.com/rapidsai/cudf/pull/16279 --- cpp/src/interop/to_arrow.cu | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/cpp/src/interop/to_arrow.cu b/cpp/src/interop/to_arrow.cu index 8c4be1b50a5..622a3aba4bb 100644 --- a/cpp/src/interop/to_arrow.cu +++ b/cpp/src/interop/to_arrow.cu @@ -378,13 +378,11 @@ std::shared_ptr dispatch_to_arrow::operator()( auto children_meta = metadata.children_meta.empty() ? std::vector{{}, {}} : metadata.children_meta; auto child_arrays = fetch_child_array(input_view, children_meta, ar_mr, stream); - if (child_arrays.empty()) { - // Empty list will have only one value in offset of 4 bytes - auto tmp_offset_buffer = allocate_arrow_buffer(sizeof(int32_t), ar_mr); - memset(tmp_offset_buffer->mutable_data(), 0, sizeof(int32_t)); - - return std::make_shared( - arrow::list(arrow::null()), 0, std::move(tmp_offset_buffer), nullptr); + if (child_arrays.empty() || child_arrays[0]->data()->length == 0) { + auto element_type = child_arrays.empty() ?
arrow::null() : child_arrays[1]->type(); + auto result = arrow::MakeEmptyArray(arrow::list(element_type), ar_mr); + CUDF_EXPECTS(result.ok(), "Failed to construct empty arrow list array\n"); + return result.ValueUnsafe(); } auto offset_buffer = child_arrays[0]->data()->buffers[1]; From a6de6cc23702ed71b80625f461a90e910a33642f Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Tue, 16 Jul 2024 15:42:12 +0100 Subject: [PATCH 09/17] Introduce version file so we can conditionally handle things in tests (#16280) We decided we would attempt to support a range of versions back to 1.0. We'll test with oldest and newest versions we support. To facilitate, introduce some versioning constants. Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - Thomas Li (https://github.com/lithomas1) URL: https://github.com/rapidsai/cudf/pull/16280 --- python/cudf_polars/cudf_polars/dsl/ir.py | 7 ++++- .../cudf_polars/cudf_polars/utils/versions.py | 28 +++++++++++++++++++ python/cudf_polars/tests/test_scan.py | 5 ++++ 3 files changed, 39 insertions(+), 1 deletion(-) create mode 100644 python/cudf_polars/cudf_polars/utils/versions.py diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 5e6544ef77c..cce0c4a3d94 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -313,7 +313,12 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: raise NotImplementedError( f"Unhandled scan type: {self.typ}" ) # pragma: no cover; post init trips first - if row_index is not None: + if ( + row_index is not None + # TODO: remove condition when dropping support for polars 1.0 + # https://github.com/pola-rs/polars/pull/17363 + and row_index[0] in self.schema + ): name, offset = row_index dtype = self.schema[name] step = plc.interop.from_arrow( diff --git a/python/cudf_polars/cudf_polars/utils/versions.py b/python/cudf_polars/cudf_polars/utils/versions.py new file mode 
100644 index 00000000000..a9ac14c25aa --- /dev/null +++ b/python/cudf_polars/cudf_polars/utils/versions.py @@ -0,0 +1,28 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +"""Version utilities so that cudf_polars supports a range of polars versions.""" + +# ruff: noqa: SIM300 +from __future__ import annotations + +from packaging.version import parse + +from polars import __version__ + +POLARS_VERSION = parse(__version__) + +POLARS_VERSION_GE_10 = POLARS_VERSION >= parse("1.0") +POLARS_VERSION_GE_11 = POLARS_VERSION >= parse("1.1") +POLARS_VERSION_GE_12 = POLARS_VERSION >= parse("1.2") +POLARS_VERSION_GT_10 = POLARS_VERSION > parse("1.0") +POLARS_VERSION_GT_11 = POLARS_VERSION > parse("1.1") +POLARS_VERSION_GT_12 = POLARS_VERSION > parse("1.2") + +POLARS_VERSION_LE_12 = POLARS_VERSION <= parse("1.2") +POLARS_VERSION_LE_11 = POLARS_VERSION <= parse("1.1") +POLARS_VERSION_LT_12 = POLARS_VERSION < parse("1.2") +POLARS_VERSION_LT_11 = POLARS_VERSION < parse("1.1") + +if POLARS_VERSION < parse("1.0"): # pragma: no cover + raise ImportError("cudf_polars requires py-polars v1.0 or greater.") diff --git a/python/cudf_polars/tests/test_scan.py b/python/cudf_polars/tests/test_scan.py index c41a94da14b..d0c41090433 100644 --- a/python/cudf_polars/tests/test_scan.py +++ b/python/cudf_polars/tests/test_scan.py @@ -10,6 +10,7 @@ assert_gpu_result_equal, assert_ir_translation_raises, ) +from cudf_polars.utils import versions @pytest.fixture( @@ -97,6 +98,10 @@ def test_scan_unsupported_raises(tmp_path): assert_ir_translation_raises(q, NotImplementedError) +@pytest.mark.xfail( + versions.POLARS_VERSION_LT_11, + reason="https://github.com/pola-rs/polars/issues/15730", +) def test_scan_row_index_projected_out(tmp_path): df = pl.DataFrame({"a": [1, 2, 3]}) From 3418f915d1a1ff82a72918d978924dfad2645a5a Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 16 Jul 2024 12:38:47 -0500 Subject: [PATCH 10/17] 
Introduce dedicated options for low memory readers (#16289) This PR disables low memory readers by default in `cudf.pandas` and instead gives a provision to enable them with dedicated options. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/16289 --- python/cudf/cudf/_lib/json.pyx | 2 +- python/cudf/cudf/io/parquet.py | 2 +- python/cudf/cudf/options.py | 26 ++++++++++++++++++++++++++ python/cudf/cudf/tests/test_json.py | 2 +- python/cudf/cudf/tests/test_parquet.py | 2 +- 5 files changed, 30 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/_lib/json.pyx b/python/cudf/cudf/_lib/json.pyx index 853dd431099..03bf9ed8b75 100644 --- a/python/cudf/cudf/_lib/json.pyx +++ b/python/cudf/cudf/_lib/json.pyx @@ -99,7 +99,7 @@ cpdef read_json(object filepaths_or_buffers, else: raise TypeError("`dtype` must be 'list like' or 'dict'") - if cudf.get_option("mode.pandas_compatible") and lines: + if cudf.get_option("io.json.low_memory") and lines: res_cols, res_col_names, res_child_names = plc.io.json.chunked_read_json( plc.io.SourceInfo(filepaths_or_buffers), processed_dtypes, diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index fd0792b5edb..02b26ea1c01 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -916,7 +916,7 @@ def _read_parquet( "cudf engine doesn't support the " f"following positional arguments: {list(args)}" ) - if cudf.get_option("mode.pandas_compatible"): + if cudf.get_option("io.parquet.low_memory"): return libparquet.ParquetReader( filepaths_or_buffers, columns=columns, diff --git a/python/cudf/cudf/options.py b/python/cudf/cudf/options.py index 1f539e7f266..94e73021cec 100644 --- a/python/cudf/cudf/options.py +++ b/python/cudf/cudf/options.py @@ -325,6 +325,32 @@ def _integer_and_none_validator(val): _make_contains_validator([False, True]), ) +_register_option( + 
"io.parquet.low_memory", + False, + textwrap.dedent( + """ + If set to `False`, reads entire parquet in one go. + If set to `True`, reads parquet file in chunks. + \tValid values are True or False. Default is False. + """ + ), + _make_contains_validator([False, True]), +) + +_register_option( + "io.json.low_memory", + False, + textwrap.dedent( + """ + If set to `False`, reads entire json in one go. + If set to `True`, reads json file in chunks. + \tValid values are True or False. Default is False. + """ + ), + _make_contains_validator([False, True]), +) + class option_context(ContextDecorator): """ diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py index 7771afd692f..c81c2d1d94b 100644 --- a/python/cudf/cudf/tests/test_json.py +++ b/python/cudf/cudf/tests/test_json.py @@ -1441,6 +1441,6 @@ def test_chunked_json_reader(): df.to_json(buf, lines=True, orient="records", engine="cudf") buf.seek(0) df = df.to_pandas() - with cudf.option_context("mode.pandas_compatible", True): + with cudf.option_context("io.json.low_memory", True): gdf = cudf.read_json(buf, lines=True) assert_eq(df, gdf) diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index ff0c9040737..ecb7fd44422 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -3772,6 +3772,6 @@ def test_parquet_reader_pandas_compatibility(): ) buffer = BytesIO() df.to_parquet(buffer) - with cudf.option_context("mode.pandas_compatible", True): + with cudf.option_context("io.parquet.low_memory", True): expected = cudf.read_parquet(buffer) assert_eq(expected, df) From e2b7e4370c8513811e9c72b30f499a5614b49f7c Mon Sep 17 00:00:00 2001 From: Kyle Edwards Date: Tue, 16 Jul 2024 14:20:00 -0400 Subject: [PATCH 11/17] Build and test with CUDA 12.5.1 (#16259) This PR updates the latest CUDA build/test version 12.2.2 to 12.5.1. 
Contributes to https://github.com/rapidsai/build-planning/issues/73 Authors: - Kyle Edwards (https://github.com/KyleFromNVIDIA) Approvers: - James Lamb (https://github.com/jameslamb) - https://github.com/jakirkham URL: https://github.com/rapidsai/cudf/pull/16259 --- .../cuda12.2-conda/devcontainer.json | 8 ++-- .devcontainer/cuda12.2-pip/devcontainer.json | 10 ++-- .github/workflows/build.yaml | 20 ++++---- .github/workflows/pandas-tests.yaml | 4 +- .github/workflows/pr.yaml | 48 +++++++++---------- .../workflows/pr_issue_status_automation.yml | 6 +-- .github/workflows/test.yaml | 22 ++++----- CONTRIBUTING.md | 2 +- README.md | 2 +- ..._64.yaml => all_cuda-125_arch-x86_64.yaml} | 4 +- dependencies.yaml | 6 ++- 11 files changed, 68 insertions(+), 64 deletions(-) rename conda/environments/{all_cuda-122_arch-x86_64.yaml => all_cuda-125_arch-x86_64.yaml} (97%) diff --git a/.devcontainer/cuda12.2-conda/devcontainer.json b/.devcontainer/cuda12.2-conda/devcontainer.json index 05bf9173d25..fadce01d060 100644 --- a/.devcontainer/cuda12.2-conda/devcontainer.json +++ b/.devcontainer/cuda12.2-conda/devcontainer.json @@ -3,7 +3,7 @@ "context": "${localWorkspaceFolder}/.devcontainer", "dockerfile": "${localWorkspaceFolder}/.devcontainer/Dockerfile", "args": { - "CUDA": "12.2", + "CUDA": "12.5", "PYTHON_PACKAGE_MANAGER": "conda", "BASE": "rapidsai/devcontainers:24.08-cpp-mambaforge-ubuntu22.04" } @@ -11,7 +11,7 @@ "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.08-cuda12.2-conda" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.08-cuda12.5-conda" ], "hostRequirements": {"gpu": "optional"}, "features": { @@ -20,7 +20,7 @@ "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" ], - "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config,conda/pkgs,conda/${localWorkspaceFolderBasename}-cuda12.2-envs}"], + "initializeCommand": 
["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config,conda/pkgs,conda/${localWorkspaceFolderBasename}-cuda12.5-envs}"], "postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . rapids-post-attach-command; fi"], "workspaceFolder": "/home/coder", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/cudf,type=bind,consistency=consistent", @@ -29,7 +29,7 @@ "source=${localWorkspaceFolder}/../.cache,target=/home/coder/.cache,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/../.config,target=/home/coder/.config,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/../.conda/pkgs,target=/home/coder/.conda/pkgs,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/../.conda/${localWorkspaceFolderBasename}-cuda12.2-envs,target=/home/coder/.conda/envs,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/../.conda/${localWorkspaceFolderBasename}-cuda12.5-envs,target=/home/coder/.conda/envs,type=bind,consistency=consistent" ], "customizations": { "vscode": { diff --git a/.devcontainer/cuda12.2-pip/devcontainer.json b/.devcontainer/cuda12.2-pip/devcontainer.json index 74420214726..026eb540952 100644 --- a/.devcontainer/cuda12.2-pip/devcontainer.json +++ b/.devcontainer/cuda12.2-pip/devcontainer.json @@ -3,15 +3,15 @@ "context": "${localWorkspaceFolder}/.devcontainer", "dockerfile": "${localWorkspaceFolder}/.devcontainer/Dockerfile", "args": { - "CUDA": "12.2", + "CUDA": "12.5", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": "rapidsai/devcontainers:24.08-cpp-cuda12.2-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.08-cpp-cuda12.5-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.08-cuda12.2-pip" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.08-cuda12.5-pip" ], "hostRequirements": {"gpu": "optional"}, "features": { @@ 
-20,7 +20,7 @@ "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" ], - "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config/pip,local/share/${localWorkspaceFolderBasename}-cuda12.2-venvs}"], + "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config/pip,local/share/${localWorkspaceFolderBasename}-cuda12.5-venvs}"], "postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . rapids-post-attach-command; fi"], "workspaceFolder": "/home/coder", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/cudf,type=bind,consistency=consistent", @@ -28,7 +28,7 @@ "source=${localWorkspaceFolder}/../.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/../.cache,target=/home/coder/.cache,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/../.config,target=/home/coder/.config,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/../.local/share/${localWorkspaceFolderBasename}-cuda12.2-venvs,target=/home/coder/.local/share/venvs,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/../.local/share/${localWorkspaceFolderBasename}-cuda12.5-venvs,target=/home/coder/.local/share/venvs,type=bind,consistency=consistent" ], "customizations": { "vscode": { diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 2e5959338b0..937080572ad 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -28,7 +28,7 @@ concurrency: jobs: cpp-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@cuda-12.5.1 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -37,7 +37,7 @@ jobs: 
python-build: needs: [cpp-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@cuda-12.5.1 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -46,7 +46,7 @@ jobs: upload-conda: needs: [cpp-build, python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@cuda-12.5.1 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -57,7 +57,7 @@ jobs: if: github.ref_type == 'branch' needs: python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1 with: arch: "amd64" branch: ${{ inputs.branch }} @@ -69,7 +69,7 @@ jobs: sha: ${{ inputs.sha }} wheel-build-cudf: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.5.1 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -79,7 +79,7 @@ jobs: wheel-publish-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@cuda-12.5.1 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -89,7 +89,7 @@ jobs: wheel-build-dask-cudf: needs: wheel-publish-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.5.1 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". 
matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -101,7 +101,7 @@ jobs: wheel-publish-dask-cudf: needs: wheel-build-dask-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@cuda-12.5.1 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -111,7 +111,7 @@ jobs: wheel-build-cudf-polars: needs: wheel-publish-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.5.1 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -123,7 +123,7 @@ jobs: wheel-publish-cudf-polars: needs: wheel-build-cudf-polars secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@cuda-12.5.1 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pandas-tests.yaml b/.github/workflows/pandas-tests.yaml index a8643923a4d..1516cb09449 100644 --- a/.github/workflows/pandas-tests.yaml +++ b/.github/workflows/pandas-tests.yaml @@ -17,9 +17,9 @@ jobs: pandas-tests: # run the Pandas unit tests secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.5.1 with: - matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.9" and .CUDA_VER == "12.2.2" )) + matrix_filter: map(select(.ARCH 
== "amd64" and .PY_VER == "3.9" and (.CUDA_VER | startswith("12.5.")) )) build_type: nightly branch: ${{ inputs.branch }} date: ${{ inputs.date }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index ceee9074b93..1fe64e7f318 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -34,41 +34,41 @@ jobs: - pandas-tests - pandas-tests-diff secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@cuda-12.5.1 checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@cuda-12.5.1 with: enable_check_generated_files: false conda-cpp-build: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@cuda-12.5.1 with: build_type: pull-request conda-cpp-checks: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@cuda-12.5.1 with: build_type: pull-request enable_check_symbols: true conda-cpp-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@cuda-12.5.1 with: build_type: pull-request conda-python-build: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@cuda-12.5.1 with: build_type: pull-request conda-python-cudf-tests: needs: conda-python-build secrets: inherit - uses: 
rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@cuda-12.5.1 with: build_type: pull-request script: "ci/test_python_cudf.sh" @@ -76,14 +76,14 @@ jobs: # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@cuda-12.5.1 with: build_type: pull-request script: "ci/test_python_other.sh" conda-java-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -93,7 +93,7 @@ jobs: static-configure: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1 with: build_type: pull-request # Use the wheel container so we can skip conda solves and since our @@ -103,7 +103,7 @@ jobs: conda-notebook-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -113,7 +113,7 @@ jobs: docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -123,21 +123,21 @@ jobs: wheel-build-cudf: needs: checks secrets: inherit - uses: 
rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.5.1 with: build_type: pull-request script: "ci/build_wheel_cudf.sh" wheel-tests-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.5.1 with: build_type: pull-request script: ci/test_wheel_cudf.sh wheel-build-cudf-polars: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.5.1 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -146,7 +146,7 @@ jobs: wheel-tests-cudf-polars: needs: wheel-build-cudf-polars secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.5.1 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -157,7 +157,7 @@ jobs: wheel-build-dask-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.5.1 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". 
matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -166,7 +166,7 @@ jobs: wheel-tests-dask-cudf: needs: wheel-build-dask-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.5.1 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -174,10 +174,10 @@ jobs: script: ci/test_wheel_dask_cudf.sh devcontainer: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@cuda-12.5.1 with: arch: '["amd64"]' - cuda: '["12.2"]' + cuda: '["12.5"]' build_command: | sccache -z; build-all -DBUILD_BENCHMARKS=ON --verbose; @@ -185,7 +185,7 @@ jobs: unit-tests-cudf-pandas: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.5.1 with: matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) build_type: pull-request @@ -194,9 +194,9 @@ jobs: # run the Pandas unit tests using PR branch needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.5.1 with: - matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.9" and .CUDA_VER == "12.2.2" )) + matrix_filter: map(select(.ARCH == 
"amd64" and .PY_VER == "3.9" and (.CUDA_VER | startswith("12.5.")) )) build_type: pull-request script: ci/cudf_pandas_scripts/pandas-tests/run.sh pr # Hide test failures because they exceed the GITHUB_STEP_SUMMARY output limit. @@ -204,7 +204,7 @@ jobs: pandas-tests-diff: # diff the results of running the Pandas unit tests and publish a job summary needs: pandas-tests - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1 with: node_type: cpu4 build_type: pull-request diff --git a/.github/workflows/pr_issue_status_automation.yml b/.github/workflows/pr_issue_status_automation.yml index 8ca971dc28d..2a8ebd30993 100644 --- a/.github/workflows/pr_issue_status_automation.yml +++ b/.github/workflows/pr_issue_status_automation.yml @@ -23,7 +23,7 @@ on: jobs: get-project-id: - uses: rapidsai/shared-workflows/.github/workflows/project-get-item-id.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/project-get-item-id.yaml@cuda-12.5.1 if: github.event.pull_request.state == 'open' secrets: inherit permissions: @@ -34,7 +34,7 @@ jobs: update-status: # This job sets the PR and its linked issues to "In Progress" status - uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@cuda-12.5.1 if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }} needs: get-project-id with: @@ -50,7 +50,7 @@ jobs: update-sprint: # This job sets the PR and its linked issues to the current "Weekly Sprint" - uses: rapidsai/shared-workflows/.github/workflows/project-get-set-iteration-field.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/project-get-set-iteration-field.yaml@cuda-12.5.1 if: ${{ github.event.pull_request.state == 'open' && 
needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }} needs: get-project-id with: diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 36c9088d93c..73f8d726e77 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,7 +16,7 @@ on: jobs: conda-cpp-checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@cuda-12.5.1 with: build_type: nightly branch: ${{ inputs.branch }} @@ -25,7 +25,7 @@ jobs: enable_check_symbols: true conda-cpp-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@cuda-12.5.1 with: build_type: nightly branch: ${{ inputs.branch }} @@ -33,7 +33,7 @@ jobs: sha: ${{ inputs.sha }} conda-cpp-memcheck-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1 with: build_type: nightly branch: ${{ inputs.branch }} @@ -45,7 +45,7 @@ jobs: run_script: "ci/test_cpp_memcheck.sh" static-configure: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1 with: build_type: pull-request # Use the wheel container so we can skip conda solves and since our @@ -54,7 +54,7 @@ jobs: run_script: "ci/configure_cpp_static.sh" conda-python-cudf-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@cuda-12.5.1 with: build_type: nightly branch: ${{ inputs.branch }} @@ -64,7 +64,7 @@ jobs: conda-python-other-tests: # Tests for dask_cudf, custreamz, cudf_kafka are separated for 
CI parallelism secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@cuda-12.5.1 with: build_type: nightly branch: ${{ inputs.branch }} @@ -73,7 +73,7 @@ jobs: script: "ci/test_python_other.sh" conda-java-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1 with: build_type: nightly branch: ${{ inputs.branch }} @@ -85,7 +85,7 @@ jobs: run_script: "ci/test_java.sh" conda-notebook-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1 with: build_type: nightly branch: ${{ inputs.branch }} @@ -97,7 +97,7 @@ jobs: run_script: "ci/test_notebooks.sh" wheel-tests-cudf: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.5.1 with: build_type: nightly branch: ${{ inputs.branch }} @@ -106,7 +106,7 @@ jobs: script: ci/test_wheel_cudf.sh wheel-tests-dask-cudf: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.5.1 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". 
matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -117,7 +117,7 @@ jobs: script: ci/test_wheel_dask_cudf.sh unit-tests-cudf-pandas: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.5.1 with: build_type: nightly branch: ${{ inputs.branch }} diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 4fbc28fa6e1..f9cdde7c2b7 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -104,7 +104,7 @@ Instructions for a minimal build environment without conda are included below. # create the conda environment (assuming in base `cudf` directory) # note: RAPIDS currently doesn't support `channel_priority: strict`; # use `channel_priority: flexible` instead -conda env create --name cudf_dev --file conda/environments/all_cuda-122_arch-x86_64.yaml +conda env create --name cudf_dev --file conda/environments/all_cuda-125_arch-x86_64.yaml # activate the environment conda activate cudf_dev ``` diff --git a/README.md b/README.md index 17d2df9a936..1ab6a2d7457 100644 --- a/README.md +++ b/README.md @@ -83,7 +83,7 @@ cuDF can be installed with conda (via [miniconda](https://docs.conda.io/projects ```bash conda install -c rapidsai -c conda-forge -c nvidia \ - cudf=24.08 python=3.11 cuda-version=12.2 + cudf=24.08 python=3.11 cuda-version=12.5 ``` We also provide [nightly Conda packages](https://anaconda.org/rapidsai-nightly) built from the HEAD diff --git a/conda/environments/all_cuda-122_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml similarity index 97% rename from conda/environments/all_cuda-122_arch-x86_64.yaml rename to conda/environments/all_cuda-125_arch-x86_64.yaml index c32d21c5d36..3f5fae49cbb 100644 --- a/conda/environments/all_cuda-122_arch-x86_64.yaml +++ 
b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -23,7 +23,7 @@ dependencies: - cuda-nvtx-dev - cuda-python>=12.0,<13.0a0 - cuda-sanitizer-api -- cuda-version=12.2 +- cuda-version=12.5 - cupy>=12.0.0 - cxx-compiler - cython>=3.0.3 @@ -96,4 +96,4 @@ dependencies: - zlib>=1.2.13 - pip: - git+https://github.com/python-streamz/streamz.git@master -name: all_cuda-122_arch-x86_64 +name: all_cuda-125_arch-x86_64 diff --git a/dependencies.yaml b/dependencies.yaml index 27621ff9a3f..67ed3773b44 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -3,7 +3,7 @@ files: all: output: conda matrix: - cuda: ["11.8", "12.2"] + cuda: ["11.8", "12.5"] arch: [x86_64] includes: - build_base @@ -402,6 +402,10 @@ dependencies: cuda: "12.2" packages: - cuda-version=12.2 + - matrix: + cuda: "12.5" + packages: + - cuda-version=12.5 cuda: specific: - output_types: conda From 05ea7c9cf6a0fd39384e2044b4c9b46f543d4ad0 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Tue, 16 Jul 2024 12:51:20 -0700 Subject: [PATCH 12/17] Fix tests for polars 1.2 (#16292) Authors: - Thomas Li (https://github.com/lithomas1) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/16292 --- python/cudf_polars/tests/test_groupby.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/cudf_polars/tests/test_groupby.py b/python/cudf_polars/tests/test_groupby.py index 50adca01950..b07d8e38217 100644 --- a/python/cudf_polars/tests/test_groupby.py +++ b/python/cudf_polars/tests/test_groupby.py @@ -12,6 +12,7 @@ assert_gpu_result_equal, assert_ir_translation_raises, ) +from cudf_polars.utils import versions @pytest.fixture @@ -100,7 +101,7 @@ def test_groupby_sorted_keys(df: pl.LazyFrame, keys, exprs): with pytest.raises(AssertionError): # https://github.com/pola-rs/polars/issues/17556 assert_gpu_result_equal(q, check_exact=False) - if schema[sort_keys[1]] == pl.Boolean(): + if 
versions.POLARS_VERSION_LT_12 and schema[sort_keys[1]] == pl.Boolean(): # https://github.com/pola-rs/polars/issues/17557 with pytest.raises(AssertionError): assert_gpu_result_equal(qsorted, check_exact=False) From 62191103032706371d76ce83c6ec59d13376b231 Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Tue, 16 Jul 2024 16:23:49 -0400 Subject: [PATCH 13/17] [BUG] Make name attr of Index fast slow attrs (#16270) Debugging the spike in failures from #16234 Authors: - Matthew Murray (https://github.com/Matt711) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/16270 --- python/cudf/cudf/pandas/_wrappers/pandas.py | 36 ++++++++------------- 1 file changed, 14 insertions(+), 22 deletions(-) diff --git a/python/cudf/cudf/pandas/_wrappers/pandas.py b/python/cudf/cudf/pandas/_wrappers/pandas.py index d3a3488081a..59a243dd7c4 100644 --- a/python/cudf/cudf/pandas/_wrappers/pandas.py +++ b/python/cudf/cudf/pandas/_wrappers/pandas.py @@ -260,18 +260,14 @@ def Index__new__(cls, *args, **kwargs): return self -def name(self): - return self._fsproxy_wrapped._name - - def Index__setattr__(self, name, value): if name.startswith("_"): object.__setattr__(self, name, value) return if name == "name": - setattr(self._fsproxy_wrapped, "_name", value) + setattr(self._fsproxy_wrapped, "name", value) if name == "names": - setattr(self._fsproxy_wrapped, "_names", value) + setattr(self._fsproxy_wrapped, "names", value) return _FastSlowAttribute("__setattr__").__get__(self, type(self))( name, value ) @@ -300,7 +296,7 @@ def Index__setattr__(self, name, value): "_accessors": set(), "_data": _FastSlowAttribute("_data", private=True), "_mask": _FastSlowAttribute("_mask", private=True), - "name": property(name), + "name": _FastSlowAttribute("name"), }, ) @@ -314,7 +310,7 @@ def Index__setattr__(self, name, value): additional_attributes={ 
"__init__": _DELETE, "__setattr__": Index__setattr__, - "name": property(name), + "name": _FastSlowAttribute("name"), }, ) @@ -345,7 +341,7 @@ def Index__setattr__(self, name, value): additional_attributes={ "__init__": _DELETE, "__setattr__": Index__setattr__, - "name": property(name), + "name": _FastSlowAttribute("name"), }, ) @@ -375,10 +371,10 @@ def Index__setattr__(self, name, value): bases=(Index,), additional_attributes={ "__init__": _DELETE, + "__setattr__": Index__setattr__, "_data": _FastSlowAttribute("_data", private=True), "_mask": _FastSlowAttribute("_mask", private=True), - "__setattr__": Index__setattr__, - "name": property(name), + "name": _FastSlowAttribute("name"), }, ) @@ -412,10 +408,10 @@ def Index__setattr__(self, name, value): bases=(Index,), additional_attributes={ "__init__": _DELETE, + "__setattr__": Index__setattr__, "_data": _FastSlowAttribute("_data", private=True), "_mask": _FastSlowAttribute("_mask", private=True), - "__setattr__": Index__setattr__, - "name": property(name), + "name": _FastSlowAttribute("name"), }, ) @@ -470,10 +466,10 @@ def Index__setattr__(self, name, value): bases=(Index,), additional_attributes={ "__init__": _DELETE, + "__setattr__": Index__setattr__, "_data": _FastSlowAttribute("_data", private=True), "_mask": _FastSlowAttribute("_mask", private=True), - "__setattr__": Index__setattr__, - "name": property(name), + "name": _FastSlowAttribute("name"), }, ) @@ -508,10 +504,6 @@ def Index__setattr__(self, name, value): ) -def names(self): - return self._fsproxy_wrapped._names - - MultiIndex = make_final_proxy_type( "MultiIndex", cudf.MultiIndex, @@ -522,7 +514,7 @@ def names(self): additional_attributes={ "__init__": _DELETE, "__setattr__": Index__setattr__, - "name": property(names), + "names": _FastSlowAttribute("names"), }, ) @@ -709,10 +701,10 @@ def names(self): bases=(Index,), additional_attributes={ "__init__": _DELETE, + "__setattr__": Index__setattr__, "_data": _FastSlowAttribute("_data", private=True), 
"_mask": _FastSlowAttribute("_mask", private=True), - "__setattr__": Index__setattr__, - "name": property(name), + "name": _FastSlowAttribute("name"), }, ) From 6a954e299d97f69a62fd184529fa7d5f29c0e09f Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Tue, 16 Jul 2024 15:02:20 -0700 Subject: [PATCH 14/17] Migrate expressions to pylibcudf (#16056) xref #15162 Migrates expresions to use pylibcudf. Authors: - Thomas Li (https://github.com/lithomas1) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/16056 --- .../api_docs/pylibcudf/datetime.rst | 6 +- .../api_docs/pylibcudf/expressions.rst | 6 + .../user_guide/api_docs/pylibcudf/index.rst | 1 + python/cudf/cudf/_lib/CMakeLists.txt | 1 - python/cudf/cudf/_lib/__init__.py | 3 +- python/cudf/cudf/_lib/expressions.pyx | 156 -------------- python/cudf/cudf/_lib/parquet.pyx | 2 +- .../cudf/cudf/_lib/pylibcudf/CMakeLists.txt | 1 + python/cudf/cudf/_lib/pylibcudf/__init__.pxd | 1 + python/cudf/cudf/_lib/pylibcudf/__init__.py | 1 + .../cudf/_lib/{ => pylibcudf}/expressions.pxd | 29 ++- .../cudf/cudf/_lib/pylibcudf/expressions.pyx | 195 ++++++++++++++++++ .../_lib/pylibcudf/libcudf/CMakeLists.txt | 4 +- .../_lib/pylibcudf/libcudf/expressions.pxd | 103 ++++----- .../_lib/pylibcudf/libcudf/expressions.pyx | 0 python/cudf/cudf/_lib/transform.pyx | 2 +- .../cudf/cudf/core/_internals/expressions.py | 11 +- .../cudf/pylibcudf_tests/test_expressions.py | 50 +++++ 18 files changed, 335 insertions(+), 237 deletions(-) create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/expressions.rst delete mode 100644 python/cudf/cudf/_lib/expressions.pyx rename python/cudf/cudf/_lib/{ => pylibcudf}/expressions.pxd (50%) create mode 100644 python/cudf/cudf/_lib/pylibcudf/expressions.pyx create mode 100644 python/cudf/cudf/_lib/pylibcudf/libcudf/expressions.pyx create mode 100644 
python/cudf/cudf/pylibcudf_tests/test_expressions.py diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/datetime.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/datetime.rst index ebf5fab3052..558268ea495 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/datetime.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/datetime.rst @@ -1,6 +1,6 @@ -======= -copying -======= +======== +datetime +======== .. automodule:: cudf._lib.pylibcudf.datetime :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/expressions.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/expressions.rst new file mode 100644 index 00000000000..03f769ee861 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/expressions.rst @@ -0,0 +1,6 @@ +=========== +expressions +=========== + +.. automodule:: cudf._lib.pylibcudf.expressions + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst index 5899d272160..505765bba0f 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst @@ -15,6 +15,7 @@ This page provides API documentation for pylibcudf. concatenate copying datetime + expressions filling gpumemoryview groupby diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index 5a067e84f56..38b7e9ebe04 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -21,7 +21,6 @@ set(cython_sources copying.pyx csv.pyx datetime.pyx - expressions.pyx filling.pyx groupby.pyx hash.pyx diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py index 18b95f5f2e1..34c0e29d0b1 100644 --- a/python/cudf/cudf/_lib/__init__.py +++ b/python/cudf/cudf/_lib/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. import numpy as np from . 
import ( @@ -8,7 +8,6 @@ copying, csv, datetime, - expressions, filling, groupby, hash, diff --git a/python/cudf/cudf/_lib/expressions.pyx b/python/cudf/cudf/_lib/expressions.pyx deleted file mode 100644 index 3fb29279ed7..00000000000 --- a/python/cudf/cudf/_lib/expressions.pyx +++ /dev/null @@ -1,156 +0,0 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. - -from enum import Enum - -import numpy as np - -from cython.operator cimport dereference -from libc.stdint cimport int64_t -from libcpp.memory cimport make_unique, unique_ptr -from libcpp.string cimport string -from libcpp.utility cimport move - -from cudf._lib.pylibcudf.libcudf cimport expressions as libcudf_exp -from cudf._lib.pylibcudf.libcudf.types cimport size_type -from cudf._lib.pylibcudf.libcudf.wrappers.timestamps cimport ( - timestamp_ms, - timestamp_us, -) - -# Necessary for proper casting, see below. -ctypedef int32_t underlying_type_ast_operator - - -# Aliases for simplicity -ctypedef unique_ptr[libcudf_exp.expression] expression_ptr - - -class ASTOperator(Enum): - ADD = libcudf_exp.ast_operator.ADD - SUB = libcudf_exp.ast_operator.SUB - MUL = libcudf_exp.ast_operator.MUL - DIV = libcudf_exp.ast_operator.DIV - TRUE_DIV = libcudf_exp.ast_operator.TRUE_DIV - FLOOR_DIV = libcudf_exp.ast_operator.FLOOR_DIV - MOD = libcudf_exp.ast_operator.MOD - PYMOD = libcudf_exp.ast_operator.PYMOD - POW = libcudf_exp.ast_operator.POW - EQUAL = libcudf_exp.ast_operator.EQUAL - NULL_EQUAL = libcudf_exp.ast_operator.NULL_EQUAL - NOT_EQUAL = libcudf_exp.ast_operator.NOT_EQUAL - LESS = libcudf_exp.ast_operator.LESS - GREATER = libcudf_exp.ast_operator.GREATER - LESS_EQUAL = libcudf_exp.ast_operator.LESS_EQUAL - GREATER_EQUAL = libcudf_exp.ast_operator.GREATER_EQUAL - BITWISE_AND = libcudf_exp.ast_operator.BITWISE_AND - BITWISE_OR = libcudf_exp.ast_operator.BITWISE_OR - BITWISE_XOR = libcudf_exp.ast_operator.BITWISE_XOR - LOGICAL_AND = libcudf_exp.ast_operator.LOGICAL_AND - NULL_LOGICAL_AND = 
libcudf_exp.ast_operator.NULL_LOGICAL_AND - LOGICAL_OR = libcudf_exp.ast_operator.LOGICAL_OR - NULL_LOGICAL_OR = libcudf_exp.ast_operator.NULL_LOGICAL_OR - # Unary operators - IDENTITY = libcudf_exp.ast_operator.IDENTITY - IS_NULL = libcudf_exp.ast_operator.IS_NULL - SIN = libcudf_exp.ast_operator.SIN - COS = libcudf_exp.ast_operator.COS - TAN = libcudf_exp.ast_operator.TAN - ARCSIN = libcudf_exp.ast_operator.ARCSIN - ARCCOS = libcudf_exp.ast_operator.ARCCOS - ARCTAN = libcudf_exp.ast_operator.ARCTAN - SINH = libcudf_exp.ast_operator.SINH - COSH = libcudf_exp.ast_operator.COSH - TANH = libcudf_exp.ast_operator.TANH - ARCSINH = libcudf_exp.ast_operator.ARCSINH - ARCCOSH = libcudf_exp.ast_operator.ARCCOSH - ARCTANH = libcudf_exp.ast_operator.ARCTANH - EXP = libcudf_exp.ast_operator.EXP - LOG = libcudf_exp.ast_operator.LOG - SQRT = libcudf_exp.ast_operator.SQRT - CBRT = libcudf_exp.ast_operator.CBRT - CEIL = libcudf_exp.ast_operator.CEIL - FLOOR = libcudf_exp.ast_operator.FLOOR - ABS = libcudf_exp.ast_operator.ABS - RINT = libcudf_exp.ast_operator.RINT - BIT_INVERT = libcudf_exp.ast_operator.BIT_INVERT - NOT = libcudf_exp.ast_operator.NOT - - -class TableReference(Enum): - LEFT = libcudf_exp.table_reference.LEFT - RIGHT = libcudf_exp.table_reference.RIGHT - - -# Note that this function only currently supports numeric literals. libcudf -# expressions don't really support other types yet though, so this isn't -# restrictive at the moment. 
-cdef class Literal(Expression): - def __cinit__(self, value): - if isinstance(value, int): - self.c_scalar.reset(new numeric_scalar[int64_t](value, True)) - self.c_obj = move(make_unique[libcudf_exp.literal]( - dereference(self.c_scalar) - )) - elif isinstance(value, float): - self.c_scalar.reset(new numeric_scalar[double](value, True)) - self.c_obj = move(make_unique[libcudf_exp.literal]( - dereference(self.c_scalar) - )) - elif isinstance(value, str): - self.c_scalar.reset(new string_scalar(value.encode(), True)) - self.c_obj = move(make_unique[libcudf_exp.literal]( - dereference(self.c_scalar) - )) - elif isinstance(value, np.datetime64): - scale, _ = np.datetime_data(value.dtype) - int_value = value.astype(np.int64) - if scale == "ms": - self.c_scalar.reset(new timestamp_scalar[timestamp_ms]( - int_value, True) - ) - self.c_obj = move(make_unique[libcudf_exp.literal]( - dereference(self.c_scalar) - )) - elif scale == "us": - self.c_scalar.reset(new timestamp_scalar[timestamp_us]( - int_value, True) - ) - self.c_obj = move(make_unique[libcudf_exp.literal]( - dereference(self.c_scalar) - )) - else: - raise NotImplementedError( - f"Unhandled datetime scale {scale=}" - ) - else: - raise NotImplementedError( - f"Don't know how to make literal with type {type(value)}" - ) - - -cdef class ColumnReference(Expression): - def __cinit__(self, size_type index): - self.c_obj = move(make_unique[libcudf_exp.column_reference]( - index - )) - - -cdef class Operation(Expression): - def __cinit__(self, op, Expression left, Expression right=None): - cdef libcudf_exp.ast_operator op_value = ( - op.value - ) - - if right is None: - self.c_obj = move(make_unique[libcudf_exp.operation]( - op_value, dereference(left.c_obj) - )) - else: - self.c_obj = move(make_unique[libcudf_exp.operation]( - op_value, dereference(left.c_obj), dereference(right.c_obj) - )) - -cdef class ColumnNameReference(Expression): - def __cinit__(self, string name): - self.c_obj = \ - 
move(make_unique[libcudf_exp.column_name_reference](name)) diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index 158fb6051c3..e7959d21e01 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -37,12 +37,12 @@ cimport cudf._lib.pylibcudf.libcudf.io.data_sink as cudf_io_data_sink cimport cudf._lib.pylibcudf.libcudf.io.types as cudf_io_types cimport cudf._lib.pylibcudf.libcudf.types as cudf_types from cudf._lib.column cimport Column -from cudf._lib.expressions cimport Expression from cudf._lib.io.utils cimport ( make_sinks_info, make_source_info, update_struct_field_names, ) +from cudf._lib.pylibcudf.expressions cimport Expression from cudf._lib.pylibcudf.io.datasource cimport NativeFileDatasource from cudf._lib.pylibcudf.libcudf.expressions cimport expression from cudf._lib.pylibcudf.libcudf.io.parquet cimport ( diff --git a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt index a2d11bbea6e..0800fa18e94 100644 --- a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt @@ -20,6 +20,7 @@ set(cython_sources concatenate.pyx copying.pyx datetime.pyx + expressions.pyx filling.pyx gpumemoryview.pyx groupby.pyx diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd index da2b7806203..26e89b818d3 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd @@ -8,6 +8,7 @@ from . 
cimport ( concatenate, copying, datetime, + expressions, filling, groupby, join, diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.py b/python/cudf/cudf/_lib/pylibcudf/__init__.py index acbc84d7177..e89a5ed9f96 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.py +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.py @@ -7,6 +7,7 @@ concatenate, copying, datetime, + expressions, filling, groupby, interop, diff --git a/python/cudf/cudf/_lib/expressions.pxd b/python/cudf/cudf/_lib/pylibcudf/expressions.pxd similarity index 50% rename from python/cudf/cudf/_lib/expressions.pxd rename to python/cudf/cudf/_lib/pylibcudf/expressions.pxd index 4a20c5fc545..64825b89d9f 100644 --- a/python/cudf/cudf/_lib/expressions.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/expressions.pxd @@ -1,36 +1,31 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. - -from libc.stdint cimport int32_t, int64_t +# Copyright (c) 2024, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr +from libcpp.string cimport string from cudf._lib.pylibcudf.libcudf.expressions cimport ( - column_reference, + ast_operator, expression, - literal, - operation, -) -from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport ( - numeric_scalar, - scalar, - string_scalar, - timestamp_scalar, + table_reference, ) +from .scalar cimport Scalar + cdef class Expression: cdef unique_ptr[expression] c_obj - cdef class Literal(Expression): - cdef unique_ptr[scalar] c_scalar - + # Hold on to input scalar so it doesn't get gc'ed + cdef Scalar scalar cdef class ColumnReference(Expression): pass - cdef class Operation(Expression): - pass + # Hold on to the input expressions so + # they don't get gc'ed + cdef Expression right + cdef Expression left cdef class ColumnNameReference(Expression): pass diff --git a/python/cudf/cudf/_lib/pylibcudf/expressions.pyx b/python/cudf/cudf/_lib/pylibcudf/expressions.pyx new file mode 100644 index 00000000000..38de11406ad --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/expressions.pyx @@ -0,0 
+1,195 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from cudf._lib.pylibcudf.libcudf.expressions import \ + ast_operator as ASTOperator # no-cython-lint +from cudf._lib.pylibcudf.libcudf.expressions import \ + table_reference as TableReference # no-cython-lint + +from cython.operator cimport dereference +from libc.stdint cimport int32_t, int64_t +from libcpp.memory cimport make_unique, unique_ptr +from libcpp.string cimport string +from libcpp.utility cimport move + +from cudf._lib.pylibcudf.libcudf cimport expressions as libcudf_exp +from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport ( + duration_scalar, + numeric_scalar, + string_scalar, + timestamp_scalar, +) +from cudf._lib.pylibcudf.libcudf.types cimport size_type, type_id +from cudf._lib.pylibcudf.libcudf.wrappers.durations cimport ( + duration_ms, + duration_ns, + duration_s, + duration_us, +) +from cudf._lib.pylibcudf.libcudf.wrappers.timestamps cimport ( + timestamp_ms, + timestamp_ns, + timestamp_s, + timestamp_us, +) + +from .scalar cimport Scalar +from .traits cimport is_chrono, is_numeric +from .types cimport DataType + +# Aliases for simplicity +ctypedef unique_ptr[libcudf_exp.expression] expression_ptr + +cdef class Literal(Expression): + """ + A literal value used in an abstract syntax tree. + + For details, see :cpp:class:`cudf::ast::literal`. + + Parameters + ---------- + value : Scalar + The Scalar value of the Literal. + Must be either numeric, string, or a timestamp/duration scalar. 
+ """ + def __cinit__(self, Scalar value): + self.scalar = value + cdef DataType typ = value.type() + cdef type_id tid = value.type().id() + if not (is_numeric(typ) or is_chrono(typ) or tid == type_id.STRING): + raise ValueError( + "Only numeric, string, or timestamp/duration scalars are accepted" + ) + # TODO: Accept type-erased scalar in AST C++ code + # Then a lot of this code can be deleted + if tid == type_id.INT64: + self.c_obj = move(make_unique[libcudf_exp.literal]( + dereference(self.scalar.c_obj) + )) + elif tid == type_id.INT32: + self.c_obj = move(make_unique[libcudf_exp.literal]( + dereference(self.scalar.c_obj) + )) + elif tid == type_id.FLOAT64: + self.c_obj = move(make_unique[libcudf_exp.literal]( + dereference(self.scalar.c_obj) + )) + elif tid == type_id.FLOAT32: + self.c_obj = move(make_unique[libcudf_exp.literal]( + dereference(self.scalar.c_obj) + )) + elif tid == type_id.STRING: + self.c_obj = move(make_unique[libcudf_exp.literal]( + dereference(self.scalar.c_obj) + )) + elif tid == type_id.TIMESTAMP_NANOSECONDS: + self.c_obj = move(make_unique[libcudf_exp.literal]( + dereference(self.scalar.c_obj) + )) + elif tid == type_id.TIMESTAMP_MICROSECONDS: + self.c_obj = move(make_unique[libcudf_exp.literal]( + dereference(self.scalar.c_obj) + )) + elif tid == type_id.TIMESTAMP_MILLISECONDS: + self.c_obj = move(make_unique[libcudf_exp.literal]( + dereference(self.scalar.c_obj) + )) + elif tid == type_id.TIMESTAMP_MILLISECONDS: + self.c_obj = move(make_unique[libcudf_exp.literal]( + dereference(self.scalar.c_obj) + )) + elif tid == type_id.TIMESTAMP_SECONDS: + self.c_obj = move(make_unique[libcudf_exp.literal]( + dereference(self.scalar.c_obj) + )) + elif tid == type_id.DURATION_NANOSECONDS: + self.c_obj = move(make_unique[libcudf_exp.literal]( + dereference(self.scalar.c_obj) + )) + elif tid == type_id.DURATION_MICROSECONDS: + self.c_obj = move(make_unique[libcudf_exp.literal]( + dereference(self.scalar.c_obj) + )) + elif tid == 
type_id.DURATION_MILLISECONDS: + self.c_obj = move(make_unique[libcudf_exp.literal]( + dereference(self.scalar.c_obj) + )) + elif tid == type_id.DURATION_MILLISECONDS: + self.c_obj = move(make_unique[libcudf_exp.literal]( + dereference(self.scalar.c_obj) + )) + elif tid == type_id.DURATION_SECONDS: + self.c_obj = move(make_unique[libcudf_exp.literal]( + dereference(self.scalar.c_obj) + )) + else: + raise NotImplementedError( + f"Don't know how to make literal with type id {tid}" + ) + +cdef class ColumnReference(Expression): + """ + An expression referring to data from a column in a table. + + For details, see :cpp:class:`cudf::ast::column_reference`. + + Parameters + ---------- + index : size_type + The index of this column in the table + (provided when the expression is evaluated). + table_source : TableReference, default TableReferenece.LEFT + Which table to use in cases with two tables (e.g. joins) + """ + def __cinit__( + self, + size_type index, + table_reference table_source=table_reference.LEFT + ): + self.c_obj = move(make_unique[libcudf_exp.column_reference]( + index, table_source + )) + + +cdef class Operation(Expression): + """ + An operation expression holds an operator and zero or more operands. + + For details, see :cpp:class:`cudf::ast::operation`. + + Parameters + ---------- + op : Operator + left : Expression + Left input expression (left operand) + right: Expression, default None + Right input expression (right operand). + You should only pass this if the input expression is a binary operation. 
+ """ + def __cinit__(self, ast_operator op, Expression left, Expression right=None): + self.left = left + self.right = right + if right is None: + self.c_obj = move(make_unique[libcudf_exp.operation]( + op, dereference(left.c_obj) + )) + else: + self.c_obj = move(make_unique[libcudf_exp.operation]( + op, dereference(left.c_obj), dereference(right.c_obj) + )) + +cdef class ColumnNameReference(Expression): + """ + An expression referring to data from a column in a table. + + For details, see :cpp:class:`cudf::ast::column_name_reference`. + + Parameters + ---------- + column_name : str + Name of this column in the table metadata + (provided when the expression is evaluated). + """ + def __cinit__(self, str name): + self.c_obj = \ + move(make_unique[libcudf_exp.column_name_reference]( + (name.encode("utf-8")) + )) diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/libcudf/CMakeLists.txt index 699e85ce567..b04e94f1546 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/CMakeLists.txt @@ -12,8 +12,8 @@ # the License. # ============================================================================= -set(cython_sources aggregation.pyx binaryop.pyx copying.pyx reduce.pyx replace.pyx round.pyx - stream_compaction.pyx types.pyx unary.pyx +set(cython_sources aggregation.pyx binaryop.pyx copying.pyx expressions.pyx reduce.pyx replace.pyx + round.pyx stream_compaction.pyx types.pyx unary.pyx ) set(linked_libraries cudf::cudf) diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/expressions.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/expressions.pxd index 279d969db50..427e16d4ff8 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/expressions.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/expressions.pxd @@ -1,5 +1,6 @@ # Copyright (c) 2022-2024, NVIDIA CORPORATION. 
+from libc.stdint cimport int32_t from libcpp.memory cimport unique_ptr from libcpp.string cimport string @@ -14,63 +15,63 @@ from cudf._lib.pylibcudf.libcudf.types cimport size_type cdef extern from "cudf/ast/expressions.hpp" namespace "cudf::ast" nogil: - ctypedef enum ast_operator: + cpdef enum class ast_operator(int32_t): # Binary operators - ADD "cudf::ast::ast_operator::ADD" - SUB "cudf::ast::ast_operator::SUB" - MUL "cudf::ast::ast_operator::MUL" - DIV "cudf::ast::ast_operator::DIV" - TRUE_DIV "cudf::ast::ast_operator::TRUE_DIV" - FLOOR_DIV "cudf::ast::ast_operator::FLOOR_DIV" - MOD "cudf::ast::ast_operator::MOD" - PYMOD "cudf::ast::ast_operator::PYMOD" - POW "cudf::ast::ast_operator::POW" - EQUAL "cudf::ast::ast_operator::EQUAL" - NULL_EQUAL "cudf::ast::ast_operator::NULL_EQUAL" - NOT_EQUAL "cudf::ast::ast_operator::NOT_EQUAL" - LESS "cudf::ast::ast_operator::LESS" - GREATER "cudf::ast::ast_operator::GREATER" - LESS_EQUAL "cudf::ast::ast_operator::LESS_EQUAL" - GREATER_EQUAL "cudf::ast::ast_operator::GREATER_EQUAL" - BITWISE_AND "cudf::ast::ast_operator::BITWISE_AND" - BITWISE_OR "cudf::ast::ast_operator::BITWISE_OR" - BITWISE_XOR "cudf::ast::ast_operator::BITWISE_XOR" - NULL_LOGICAL_AND "cudf::ast::ast_operator::NULL_LOGICAL_AND" - LOGICAL_AND "cudf::ast::ast_operator::LOGICAL_AND" - NULL_LOGICAL_OR "cudf::ast::ast_operator::NULL_LOGICAL_OR" - LOGICAL_OR "cudf::ast::ast_operator::LOGICAL_OR" + ADD + SUB + MUL + DIV + TRUE_DIV + FLOOR_DIV + MOD + PYMOD + POW + EQUAL + NULL_EQUAL + NOT_EQUAL + LESS + GREATER + LESS_EQUAL + GREATER_EQUAL + BITWISE_AND + BITWISE_OR + BITWISE_XOR + NULL_LOGICAL_AND + LOGICAL_AND + NULL_LOGICAL_OR + LOGICAL_OR # Unary operators - IDENTITY "cudf::ast::ast_operator::IDENTITY" - IS_NULL "cudf::ast::ast_operator::IS_NULL" - SIN "cudf::ast::ast_operator::SIN" - COS "cudf::ast::ast_operator::COS" - TAN "cudf::ast::ast_operator::TAN" - ARCSIN "cudf::ast::ast_operator::ARCSIN" - ARCCOS "cudf::ast::ast_operator::ARCCOS" - ARCTAN 
"cudf::ast::ast_operator::ARCTAN" - SINH "cudf::ast::ast_operator::SINH" - COSH "cudf::ast::ast_operator::COSH" - TANH "cudf::ast::ast_operator::TANH" - ARCSINH "cudf::ast::ast_operator::ARCSINH" - ARCCOSH "cudf::ast::ast_operator::ARCCOSH" - ARCTANH "cudf::ast::ast_operator::ARCTANH" - EXP "cudf::ast::ast_operator::EXP" - LOG "cudf::ast::ast_operator::LOG" - SQRT "cudf::ast::ast_operator::SQRT" - CBRT "cudf::ast::ast_operator::CBRT" - CEIL "cudf::ast::ast_operator::CEIL" - FLOOR "cudf::ast::ast_operator::FLOOR" - ABS "cudf::ast::ast_operator::ABS" - RINT "cudf::ast::ast_operator::RINT" - BIT_INVERT "cudf::ast::ast_operator::BIT_INVERT" - NOT "cudf::ast::ast_operator::NOT" + IDENTITY + IS_NULL + SIN + COS + TAN + ARCSIN + ARCCOS + ARCTAN + SINH + COSH + TANH + ARCSINH + ARCCOSH + ARCTANH + EXP + LOG + SQRT + CBRT + CEIL + FLOOR + ABS + RINT + BIT_INVERT + NOT cdef cppclass expression: pass - ctypedef enum table_reference: - LEFT "cudf::ast::table_reference::LEFT" - RIGHT "cudf::ast::table_reference::RIGHT" + cpdef enum class table_reference(int32_t): + LEFT + RIGHT cdef cppclass literal(expression): # Due to https://github.com/cython/cython/issues/3198, we need to diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/expressions.pyx b/python/cudf/cudf/_lib/pylibcudf/libcudf/expressions.pyx new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/cudf/cudf/_lib/transform.pyx b/python/cudf/cudf/_lib/transform.pyx index 86a4a60eef1..622725e06a3 100644 --- a/python/cudf/cudf/_lib/transform.pyx +++ b/python/cudf/cudf/_lib/transform.pyx @@ -19,8 +19,8 @@ from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer cimport cudf._lib.pylibcudf.libcudf.transform as libcudf_transform from cudf._lib.column cimport Column -from cudf._lib.expressions cimport Expression from cudf._lib.pylibcudf cimport transform as plc_transform +from cudf._lib.pylibcudf.expressions cimport Expression from cudf._lib.pylibcudf.libcudf.column.column cimport column from 
cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view from cudf._lib.pylibcudf.libcudf.expressions cimport expression diff --git a/python/cudf/cudf/core/_internals/expressions.py b/python/cudf/cudf/core/_internals/expressions.py index 393a68dd844..63714a78572 100644 --- a/python/cudf/cudf/core/_internals/expressions.py +++ b/python/cudf/cudf/core/_internals/expressions.py @@ -4,7 +4,10 @@ import ast import functools -from cudf._lib.expressions import ( +import pyarrow as pa + +import cudf._lib.pylibcudf as plc +from cudf._lib.pylibcudf.expressions import ( ASTOperator, ColumnReference, Expression, @@ -122,7 +125,9 @@ def visit_Constant(self, node): f"Unsupported literal {repr(node.value)} of type " "{type(node.value).__name__}" ) - self.stack.append(Literal(node.value)) + self.stack.append( + Literal(plc.interop.from_arrow(pa.scalar(node.value))) + ) def visit_UnaryOp(self, node): self.visit(node.operand) @@ -132,7 +137,7 @@ def visit_UnaryOp(self, node): # operand, so there's no way to know whether this should be a float # or an int. We should maybe see what Spark does, and this will # probably require casting. - self.nodes.append(Literal(-1)) + self.nodes.append(Literal(plc.interop.from_arrow(pa.scalar(-1)))) op = ASTOperator.MUL self.stack.append(Operation(op, self.nodes[-1], self.nodes[-2])) elif isinstance(node.op, ast.UAdd): diff --git a/python/cudf/cudf/pylibcudf_tests/test_expressions.py b/python/cudf/cudf/pylibcudf_tests/test_expressions.py new file mode 100644 index 00000000000..f661512caad --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/test_expressions.py @@ -0,0 +1,50 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+import pyarrow as pa +import pytest + +import cudf._lib.pylibcudf as plc + +# We can't really evaluate these expressions, so just make sure +# construction works properly + + +def test_literal_construction_invalid(): + with pytest.raises(ValueError): + plc.expressions.Literal( + plc.interop.from_arrow(pa.scalar(None, type=pa.list_(pa.int64()))) + ) + + +@pytest.mark.parametrize( + "tableref", + [ + plc.expressions.TableReference.LEFT, + plc.expressions.TableReference.RIGHT, + ], +) +def test_columnref_construction(tableref): + plc.expressions.ColumnReference(1.0, tableref) + + +def test_columnnameref_construction(): + plc.expressions.ColumnNameReference("abc") + + +@pytest.mark.parametrize( + "kwargs", + [ + # Unary op + { + "op": plc.expressions.ASTOperator.IDENTITY, + "left": plc.expressions.ColumnReference(1), + }, + # Binop + { + "op": plc.expressions.ASTOperator.ADD, + "left": plc.expressions.ColumnReference(1), + "right": plc.expressions.ColumnReference(2), + }, + ], +) +def test_astoperation_construction(kwargs): + plc.expressions.Operation(**kwargs) From 2f8d514b1687164a94bbe89da1dab8eb37682b35 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Tue, 16 Jul 2024 20:15:25 -0400 Subject: [PATCH 15/17] Remove xml from sort_ninja_log.py utility (#16274) Removes xml support from the `sort_ninja_log.py` utility. The xml support was experimental for possible use with Jenkins reporting that never materialized. This script is used in build.sh generally when running local builds. 
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Srinivas Yadav (https://github.com/srinivasyadav18) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/16274 --- cpp/scripts/sort_ninja_log.py | 58 ++++++----------------------------- 1 file changed, 9 insertions(+), 49 deletions(-) diff --git a/cpp/scripts/sort_ninja_log.py b/cpp/scripts/sort_ninja_log.py index 3fe503f749e..42f84e4d0c7 100755 --- a/cpp/scripts/sort_ninja_log.py +++ b/cpp/scripts/sort_ninja_log.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. # import argparse import os @@ -9,14 +9,12 @@ from xml.dom import minidom parser = argparse.ArgumentParser() -parser.add_argument( - "log_file", type=str, default=".ninja_log", help=".ninja_log file" -) +parser.add_argument("log_file", type=str, default=".ninja_log", help=".ninja_log file") parser.add_argument( "--fmt", type=str, default="csv", - choices=["csv", "xml", "html"], + choices=["csv", "html"], help="output format (to stdout)", ) parser.add_argument( @@ -37,6 +35,7 @@ output_fmt = args.fmt cmp_file = args.cmp_log + # build a map of the log entries def build_log_map(log_file): entries = {} @@ -68,37 +67,6 @@ def build_log_map(log_file): return entries -# output results in XML format -def output_xml(entries, sorted_list, args): - root = ET.Element("testsuites") - testsuite = ET.Element( - "testsuite", - attrib={ - "name": "build-time", - "tests": str(len(sorted_list)), - "failures": str(0), - "errors": str(0), - }, - ) - root.append(testsuite) - for name in sorted_list: - entry = entries[name] - build_time = float(entry[1] - entry[0]) / 1000 - item = ET.Element( - "testcase", - attrib={ - "classname": "BuildTime", - "name": name, - "time": str(build_time), - }, - ) - testsuite.append(item) - - tree = ET.ElementTree(root) - xmlstr = minidom.parseString(ET.tostring(root)).toprettyxml(indent=" ") - print(xmlstr) - - # utility converts a 
millisecond value to a column width in pixels def time_to_width(value, end): # map a value from (0,end) to (0,1000) @@ -282,9 +250,7 @@ def output_html(entries, sorted_list, cmp_entries, args): # output detail table in build-time descending order print("") - print( - "", "", "", sep="" - ) + print("", "", "", sep="") if cmp_entries: print("", sep="") print("") @@ -303,9 +269,7 @@ def output_html(entries, sorted_list, cmp_entries, args): print("", sep="", end="") print("", sep="", end="") # output diff column - cmp_entry = ( - cmp_entries[name] if cmp_entries and name in cmp_entries else None - ) + cmp_entry = cmp_entries[name] if cmp_entries and name in cmp_entries else None if cmp_entry: diff_time = build_time - (cmp_entry[1] - cmp_entry[0]) diff_time_str = format_build_time(diff_time) @@ -353,7 +317,7 @@ def output_html(entries, sorted_list, cmp_entries, args): print( "time change < 20%% or build time < 1 minute", + ">time change < 20% or build time < 1 minute", ) print("
FileCompile timeSize
FileCompile timeSizet-cmp
", build_time_str, "", file_size_str, "
") @@ -370,9 +334,7 @@ def output_csv(entries, sorted_list, cmp_entries, args): entry = entries[name] build_time = entry[1] - entry[0] file_size = entry[2] - cmp_entry = ( - cmp_entries[name] if cmp_entries and name in cmp_entries else None - ) + cmp_entry = cmp_entries[name] if cmp_entries and name in cmp_entries else None print(build_time, file_size, name, sep=",", end="") if cmp_entry: diff_time = build_time - (cmp_entry[1] - cmp_entry[0]) @@ -396,9 +358,7 @@ def output_csv(entries, sorted_list, cmp_entries, args): # load the comparison build log if available cmp_entries = build_log_map(cmp_file) if cmp_file else None -if output_fmt == "xml": - output_xml(entries, sorted_list, args) -elif output_fmt == "html": +if output_fmt == "html": output_html(entries, sorted_list, cmp_entries, args) else: output_csv(entries, sorted_list, cmp_entries, args) From 093bcc94ccf156a7e39339a7c4bb7e86543187de Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Tue, 16 Jul 2024 20:16:07 -0400 Subject: [PATCH 16/17] Update cudf::detail::grid_1d to use thread_index_type (#16276) Updates the `cudf::detail::grid_1d` to use `thread_index_type` instead of `int` and `size_type` for the number threads and blocks. This has become important for launching kernels with more threads than max `size_type` total bytes for warp-per-row and thread-per-byte algorithms. 
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - Vyas Ramasubramani (https://github.com/vyasr) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/16276 --- cpp/include/cudf/detail/utilities/cuda.cuh | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/include/cudf/detail/utilities/cuda.cuh b/cpp/include/cudf/detail/utilities/cuda.cuh index f1775c6d6d7..5007af7f9f1 100644 --- a/cpp/include/cudf/detail/utilities/cuda.cuh +++ b/cpp/include/cudf/detail/utilities/cuda.cuh @@ -41,8 +41,8 @@ static constexpr size_type warp_size{32}; */ class grid_1d { public: - int const num_threads_per_block; - int const num_blocks; + thread_index_type const num_threads_per_block; + thread_index_type const num_blocks; /** * @param overall_num_elements The number of elements the kernel needs to * handle/process, in its main, one-dimensional/linear input (e.g. one or more @@ -55,9 +55,9 @@ class grid_1d { * than a single element; this affects the number of threads the grid must * contain */ - grid_1d(cudf::size_type overall_num_elements, - cudf::size_type num_threads_per_block, - cudf::size_type elements_per_thread = 1) + grid_1d(thread_index_type overall_num_elements, + thread_index_type num_threads_per_block, + thread_index_type elements_per_thread = 1) : num_threads_per_block(num_threads_per_block), num_blocks(util::div_rounding_up_safe(overall_num_elements, elements_per_thread * num_threads_per_block)) From aa466aaf91bc329cc4fced9b9a3426d79bfe7ffc Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Wed, 17 Jul 2024 10:48:17 -0400 Subject: [PATCH 17/17] Move kernel vis over to CUDF_HIDDEN (#16165) Use CUDF_HIDDEN instead of the raw `__attribute__((visibility("hidden")))` for symbol visibility controls on the CUDA kernels that we call from multiple TUs. 
This is primarily a style change so that we have consistent visibility markup across the entire project. Authors: - Robert Maynard (https://github.com/robertmaynard) Approvers: - Yunsong Wang (https://github.com/PointKernel) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/16165 --- cpp/src/join/mixed_join_kernel.cuh | 3 ++- cpp/src/join/mixed_join_kernels_semi.cu | 3 ++- cpp/src/join/mixed_join_size_kernel.cuh | 28 ++++++++++++------------- 3 files changed, 18 insertions(+), 16 deletions(-) diff --git a/cpp/src/join/mixed_join_kernel.cuh b/cpp/src/join/mixed_join_kernel.cuh index 0fc1c3718b1..ea59f23c77f 100644 --- a/cpp/src/join/mixed_join_kernel.cuh +++ b/cpp/src/join/mixed_join_kernel.cuh @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -38,7 +39,7 @@ namespace cg = cooperative_groups; #pragma GCC diagnostic ignored "-Wattributes" template -__attribute__((visibility("hidden"))) __launch_bounds__(block_size) __global__ +CUDF_HIDDEN __launch_bounds__(block_size) __global__ void mixed_join(table_device_view left_table, table_device_view right_table, table_device_view probe, diff --git a/cpp/src/join/mixed_join_kernels_semi.cu b/cpp/src/join/mixed_join_kernels_semi.cu index 01e3fe09b38..1f31eaa7878 100644 --- a/cpp/src/join/mixed_join_kernels_semi.cu +++ b/cpp/src/join/mixed_join_kernels_semi.cu @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -34,7 +35,7 @@ namespace cg = cooperative_groups; #pragma GCC diagnostic ignored "-Wattributes" template -__attribute__((visibility("hidden"))) __launch_bounds__(block_size) __global__ +CUDF_HIDDEN __launch_bounds__(block_size) __global__ void mixed_join_semi(table_device_view left_table, table_device_view right_table, table_device_view probe, diff --git a/cpp/src/join/mixed_join_size_kernel.cuh b/cpp/src/join/mixed_join_size_kernel.cuh index 618e7a9082e..00a90f8273f 100644 --- a/cpp/src/join/mixed_join_size_kernel.cuh +++ 
b/cpp/src/join/mixed_join_size_kernel.cuh @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -35,20 +36,19 @@ namespace cg = cooperative_groups; #pragma GCC diagnostic ignored "-Wattributes" template -__attribute__((visibility("hidden"))) __launch_bounds__(block_size) __global__ - void compute_mixed_join_output_size( - table_device_view left_table, - table_device_view right_table, - table_device_view probe, - table_device_view build, - row_hash const hash_probe, - row_equality const equality_probe, - join_kind const join_type, - cudf::detail::mixed_multimap_type::device_view hash_table_view, - ast::detail::expression_device_view device_expression_data, - bool const swap_tables, - std::size_t* output_size, - cudf::device_span matches_per_row) +CUDF_HIDDEN __launch_bounds__(block_size) __global__ void compute_mixed_join_output_size( + table_device_view left_table, + table_device_view right_table, + table_device_view probe, + table_device_view build, + row_hash const hash_probe, + row_equality const equality_probe, + join_kind const join_type, + cudf::detail::mixed_multimap_type::device_view hash_table_view, + ast::detail::expression_device_view device_expression_data, + bool const swap_tables, + std::size_t* output_size, + cudf::device_span matches_per_row) { // The (required) extern storage of the shared memory array leads to // conflicting declarations between different templates. The easiest