From 3f19d04c1889e7114cbed9081359f3bdcb9f6e64 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 10 Jan 2024 11:11:28 -1000 Subject: [PATCH] Remove unneeded methods in Column (#14730) * `valid_count` can be composed of `null_count` or where checked `has_nulls` * `contains_na_entries` is redundant with `has_nulls` * Better typing in `searchsorted` Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/14730 --- python/cudf/cudf/core/_base_index.py | 16 +++++++++++----- python/cudf/cudf/core/column/categorical.py | 4 +++- python/cudf/cudf/core/column/column.py | 20 ++++++-------------- python/cudf/cudf/core/column/numerical.py | 6 +----- python/cudf/cudf/core/dataframe.py | 11 ++++++++--- python/cudf/cudf/core/frame.py | 9 +++++++-- python/cudf/cudf/core/index.py | 11 ++++++----- python/cudf/cudf/core/series.py | 2 +- python/cudf/cudf/tests/test_categorical.py | 4 ++-- python/cudf/cudf/tests/test_orc.py | 8 +++++--- python/cudf/cudf/utils/dtypes.py | 4 ++-- 11 files changed, 52 insertions(+), 43 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 8d2506403d4..2aef77b6c99 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -2,11 +2,10 @@ from __future__ import annotations -import builtins import pickle import warnings from functools import cached_property -from typing import Any, Set, Tuple +from typing import Any, Literal, Set, Tuple import pandas as pd from typing_extensions import Self @@ -1702,6 +1701,8 @@ def find_label_range(self, loc: slice) -> slice: start = loc.start stop = loc.stop step = 1 if loc.step is None else loc.step + start_side: Literal["left", "right"] + stop_side: Literal["left", "right"] if step < 0: start_side, stop_side = "right", "left" else: @@ -1725,9 +1726,9 @@ def find_label_range(self, loc: slice) -> slice: def searchsorted( self, value, - side: builtins.str = "left", + side: Literal["left", "right"] = "left", ascending: bool = True, - na_position: builtins.str = "last", + na_position: Literal["first", "last"] = "last", ): """Find index where elements should be inserted to maintain order @@ -1754,7 +1755,12 @@ def searchsorted( """ raise NotImplementedError - def get_slice_bound(self, label, side: builtins.str, kind=None) -> int: + def get_slice_bound( + self, + label, + side: Literal["left", "right"], + kind: Literal["ix", "loc", "getitem", None] = None, + ) -> int: """ Calculate slice bound that corresponds to given label. Returns leftmost (one-past-the-rightmost if ``side=='right'``) position diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index f3f2be0cc45..59fd4631067 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -1372,7 +1372,9 @@ def _concat( # improved as the concatenation API is solidified. # Find the first non-null column: - head = next((obj for obj in objs if obj.valid_count), objs[0]) + head = next( + (obj for obj in objs if not obj.null_count != len(obj)), objs[0] + ) # Combine and de-dupe the categories cats = column.concat_columns([o.categories for o in objs]).unique() diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 440ac855691..6af39dd3558 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -12,6 +12,7 @@ Any, Dict, List, + Literal, MutableSequence, Optional, Sequence, @@ -428,11 +429,6 @@ def _fill( def shift(self, offset: int, fill_value: ScalarLike) -> ColumnBase: return libcudf.copying.shift(self, offset, fill_value) - @property - def valid_count(self) -> int: - """Number of non-null values""" - return len(self) - self.null_count - @property def nullmask(self) -> Buffer: """The gpu buffer for the null-mask""" @@ -1159,9 +1155,9 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): def searchsorted( self, value, - side: str = "left", + side: Literal["left", "right"] = "left", ascending: bool = True, - na_position: str = "last", + na_position: Literal["first", "last"] = "last", ) -> Self: if not isinstance(value, ColumnBase) or value.dtype != self.dtype: raise ValueError( @@ -1304,10 +1300,6 @@ def _reduce( return libcudf.reduce.reduce(op, preprocessed, **kwargs) return preprocessed - @property - def contains_na_entries(self) -> bool: - return self.null_count != 0 - def _process_for_reduction( self, skipna: Optional[bool] = None, min_count: int = 0 ) -> Union[ColumnBase, ScalarLike]: @@ -2742,7 +2734,7 @@ def concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase: # If all columns are `NumericalColumn` with different dtypes, # we cast them to a common dtype. # Notice, we can always cast pure null columns - not_null_col_dtypes = [o.dtype for o in objs if o.valid_count] + not_null_col_dtypes = [o.dtype for o in objs if o.null_count != len(o)] if len(not_null_col_dtypes) and all( _is_non_decimal_numeric_dtype(dtyp) and np.issubdtype(dtyp, np.datetime64) @@ -2754,13 +2746,13 @@ def concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase: objs = [obj.astype(common_dtype) for obj in objs] # Find the first non-null column: - head = next((obj for obj in objs if obj.valid_count), objs[0]) + head = next((obj for obj in objs if obj.null_count != len(obj)), objs[0]) for i, obj in enumerate(objs): # Check that all columns are the same type: if not is_dtype_equal(obj.dtype, head.dtype): # if all null, cast to appropriate dtype - if obj.valid_count == 0: + if obj.null_count == len(obj): objs[i] = column_empty_like( head, dtype=head.dtype, masked=True, newsize=len(obj) ) diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index e848c86897f..6ef3a6abacc 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -140,7 +140,7 @@ def indices_of(self, value: ScalarLike) -> NumericalColumn: else: return super().indices_of(value) - def has_nulls(self, include_nan=False): + def has_nulls(self, include_nan: bool = False) -> bool: return bool(self.null_count != 0) or ( include_nan and bool(self.nan_count != 0) ) @@ -425,10 +425,6 @@ def dropna(self, drop_nan: bool = False) -> NumericalColumn: col = self.nans_to_nulls() if drop_nan else self return drop_nulls([col])[0] - @property - def contains_na_entries(self) -> bool: - return (self.nan_count != 0) or (self.null_count != 0) - def _process_values_for_isin( self, values: Sequence ) -> Tuple[ColumnBase, ColumnBase]: diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 8dd81c92994..a3373951a06 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -5997,9 +5997,14 @@ def count(self, axis=0, level=None, numeric_only=False, **kwargs): axis = self._get_axis_from_axis_arg(axis) if axis != 0: raise NotImplementedError("Only axis=0 is currently supported.") - + length = len(self) return Series._from_data( - {None: [self._data[col].valid_count for col in self._data.names]}, + { + None: [ + length - self._data[col].null_count + for col in self._data.names + ] + }, as_index(self._data.names), ) @@ -8091,7 +8096,7 @@ def _get_non_null_cols_and_dtypes(col_idxs, list_of_columns): # non-null Column with the same name is found. if idx not in dtypes: dtypes[idx] = cols[idx].dtype - if cols[idx].valid_count > 0: + if cols[idx].null_count != len(cols[idx]): if idx not in non_null_columns: non_null_columns[idx] = [cols[idx]] else: diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 123f13f8733..5f7a86e86d8 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -13,6 +13,7 @@ Callable, Dict, List, + Literal, MutableMapping, Optional, Tuple, @@ -882,7 +883,7 @@ def fillna( replace_val = None should_fill = ( col_name in value - and col.contains_na_entries + and col.has_nulls(include_nan=True) and not libcudf.scalar._is_null_host_scalar(replace_val) ) or method is not None if should_fill: @@ -1354,7 +1355,11 @@ def notna(self): @_cudf_nvtx_annotate def searchsorted( - self, values, side="left", ascending=True, na_position="last" + self, + values, + side: Literal["left", "right"] = "left", + ascending: bool = True, + na_position: Literal["first", "last"] = "last", ): """Find indices where elements should be inserted to maintain order diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 6bc632e0a53..5c33cd09ad1 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. from __future__ import annotations @@ -11,6 +11,7 @@ Any, Dict, List, + Literal, MutableMapping, Optional, Sequence, @@ -233,9 +234,9 @@ def _copy_type_metadata( def searchsorted( self, value: int, - side: str = "left", + side: Literal["left", "right"] = "left", ascending: bool = True, - na_position: str = "last", + na_position: Literal["first", "last"] = "last", ): assert (len(self) <= 1) or ( ascending == (self._step > 0) @@ -2205,9 +2206,9 @@ def copy(self, name=None, deep=False, dtype=None, names=None): def searchsorted( self, value, - side: str = "left", + side: Literal["left", "right"] = "left", ascending: bool = True, - na_position: str = "last", + na_position: Literal["first", "last"] = "last", ): value = self.dtype.type(value) return super().searchsorted( diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index fcb4e77f6a5..8739a61dd8b 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1710,7 +1710,7 @@ def _concat(cls, objs, axis=0, index=True): @_cudf_nvtx_annotate def valid_count(self): """Number of non-null values""" - return self._column.valid_count + return len(self) - self._column.null_count @property # type: ignore @_cudf_nvtx_annotate diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py index 49eeff01bee..52b7236b965 100644 --- a/python/cudf/cudf/tests/test_categorical.py +++ b/python/cudf/cudf/tests/test_categorical.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. import operator import string @@ -217,7 +217,7 @@ def test_categorical_masking(): got_masked = sr[got_matches] assert len(expect_masked) == len(got_masked) - assert len(expect_masked) == got_masked.valid_count + assert got_masked.null_count == 0 assert_eq(got_masked, expect_masked) diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index 7407da9c4ac..4630b6eef0a 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. import datetime import decimal @@ -812,7 +812,7 @@ def test_orc_write_bool_statistics(tmpdir, datadir, nrows): if "number_of_values" in file_stats[0][col]: stats_valid_count = file_stats[0][col]["number_of_values"] - actual_valid_count = gdf[col].valid_count + actual_valid_count = len(gdf[col]) - gdf[col].null_count assert normalized_equals(actual_valid_count, stats_valid_count) # compare stripe statistics with actual min/max @@ -827,7 +827,9 @@ def test_orc_write_bool_statistics(tmpdir, datadir, nrows): assert normalized_equals(actual_true_count, stats_true_count) if "number_of_values" in stripes_stats[stripe_idx][col]: - actual_valid_count = stripe_df[col].valid_count + actual_valid_count = ( + len(stripe_df[col]) - stripe_df[col].null_count + ) stats_valid_count = stripes_stats[stripe_idx][col][ "number_of_values" ] diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index c65404445cb..72721b5197f 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. import datetime from collections import namedtuple @@ -401,7 +401,7 @@ def min_column_type(x, expected_type): if not isinstance(x, cudf.core.column.NumericalColumn): raise TypeError("Argument x must be of type column.NumericalColumn") - if x.valid_count == 0: + if x.null_count == len(x): return x.dtype if np.issubdtype(x.dtype, np.floating):