Remove unneeded methods in Column (#14730)

* `valid_count` can be composed of `null_count` or where checked `has_nulls` * `contains_na_entries` is redundant with `has_nulls` * Better typing in `searchsorted` Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: #14730
rapidsai · Jan 10, 2024 · 3f19d04 · 3f19d04
1 parent fa37e13
commit 3f19d04
Show file tree

Hide file tree

Showing 11 changed files with 52 additions and 43 deletions.
diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py
@@ -2,11 +2,10 @@
 
 from __future__ import annotations
 
-import builtins
 import pickle
 import warnings
 from functools import cached_property
-from typing import Any, Set, Tuple
+from typing import Any, Literal, Set, Tuple
 
 import pandas as pd
 from typing_extensions import Self
@@ -1702,6 +1701,8 @@ def find_label_range(self, loc: slice) -> slice:
         start = loc.start
         stop = loc.stop
         step = 1 if loc.step is None else loc.step
+        start_side: Literal["left", "right"]
+        stop_side: Literal["left", "right"]
         if step < 0:
             start_side, stop_side = "right", "left"
         else:
@@ -1725,9 +1726,9 @@ def find_label_range(self, loc: slice) -> slice:
     def searchsorted(
         self,
         value,
-        side: builtins.str = "left",
+        side: Literal["left", "right"] = "left",
         ascending: bool = True,
-        na_position: builtins.str = "last",
+        na_position: Literal["first", "last"] = "last",
     ):
         """Find index where elements should be inserted to maintain order
 
@@ -1754,7 +1755,12 @@ def searchsorted(
         """
         raise NotImplementedError
 
-    def get_slice_bound(self, label, side: builtins.str, kind=None) -> int:
+    def get_slice_bound(
+        self,
+        label,
+        side: Literal["left", "right"],
+        kind: Literal["ix", "loc", "getitem", None] = None,
+    ) -> int:
         """
         Calculate slice bound that corresponds to given label.
         Returns leftmost (one-past-the-rightmost if ``side=='right'``) position

diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py
@@ -1372,7 +1372,9 @@ def _concat(
         # improved as the concatenation API is solidified.
 
         # Find the first non-null column:
-        head = next((obj for obj in objs if obj.valid_count), objs[0])
+        head = next(
+            (obj for obj in objs if not obj.null_count != len(obj)), objs[0]
+        )
 
         # Combine and de-dupe the categories
         cats = column.concat_columns([o.categories for o in objs]).unique()

diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
@@ -12,6 +12,7 @@
     Any,
     Dict,
     List,
+    Literal,
     MutableSequence,
     Optional,
     Sequence,
@@ -428,11 +429,6 @@ def _fill(
     def shift(self, offset: int, fill_value: ScalarLike) -> ColumnBase:
         return libcudf.copying.shift(self, offset, fill_value)
 
-    @property
-    def valid_count(self) -> int:
-        """Number of non-null values"""
-        return len(self) - self.null_count
-
     @property
     def nullmask(self) -> Buffer:
         """The gpu buffer for the null-mask"""
@@ -1159,9 +1155,9 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
     def searchsorted(
         self,
         value,
-        side: str = "left",
+        side: Literal["left", "right"] = "left",
         ascending: bool = True,
-        na_position: str = "last",
+        na_position: Literal["first", "last"] = "last",
     ) -> Self:
         if not isinstance(value, ColumnBase) or value.dtype != self.dtype:
             raise ValueError(
@@ -1304,10 +1300,6 @@ def _reduce(
             return libcudf.reduce.reduce(op, preprocessed, **kwargs)
         return preprocessed
 
-    @property
-    def contains_na_entries(self) -> bool:
-        return self.null_count != 0
-
     def _process_for_reduction(
         self, skipna: Optional[bool] = None, min_count: int = 0
     ) -> Union[ColumnBase, ScalarLike]:
@@ -2742,7 +2734,7 @@ def concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase:
     # If all columns are `NumericalColumn` with different dtypes,
     # we cast them to a common dtype.
     # Notice, we can always cast pure null columns
-    not_null_col_dtypes = [o.dtype for o in objs if o.valid_count]
+    not_null_col_dtypes = [o.dtype for o in objs if o.null_count != len(o)]
     if len(not_null_col_dtypes) and all(
         _is_non_decimal_numeric_dtype(dtyp)
         and np.issubdtype(dtyp, np.datetime64)
@@ -2754,13 +2746,13 @@ def concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase:
         objs = [obj.astype(common_dtype) for obj in objs]
 
     # Find the first non-null column:
-    head = next((obj for obj in objs if obj.valid_count), objs[0])
+    head = next((obj for obj in objs if obj.null_count != len(obj)), objs[0])
 
     for i, obj in enumerate(objs):
         # Check that all columns are the same type:
         if not is_dtype_equal(obj.dtype, head.dtype):
             # if all null, cast to appropriate dtype
-            if obj.valid_count == 0:
+            if obj.null_count == len(obj):
                 objs[i] = column_empty_like(
                     head, dtype=head.dtype, masked=True, newsize=len(obj)
                 )

diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
@@ -140,7 +140,7 @@ def indices_of(self, value: ScalarLike) -> NumericalColumn:
         else:
             return super().indices_of(value)
 
-    def has_nulls(self, include_nan=False):
+    def has_nulls(self, include_nan: bool = False) -> bool:
         return bool(self.null_count != 0) or (
             include_nan and bool(self.nan_count != 0)
         )
@@ -425,10 +425,6 @@ def dropna(self, drop_nan: bool = False) -> NumericalColumn:
         col = self.nans_to_nulls() if drop_nan else self
         return drop_nulls([col])[0]
 
-    @property
-    def contains_na_entries(self) -> bool:
-        return (self.nan_count != 0) or (self.null_count != 0)
-
     def _process_values_for_isin(
         self, values: Sequence
     ) -> Tuple[ColumnBase, ColumnBase]:

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
@@ -5997,9 +5997,14 @@ def count(self, axis=0, level=None, numeric_only=False, **kwargs):
         axis = self._get_axis_from_axis_arg(axis)
         if axis != 0:
             raise NotImplementedError("Only axis=0 is currently supported.")
-
+        length = len(self)
         return Series._from_data(
-            {None: [self._data[col].valid_count for col in self._data.names]},
+            {
+                None: [
+                    length - self._data[col].null_count
+                    for col in self._data.names
+                ]
+            },
             as_index(self._data.names),
         )
 
@@ -8091,7 +8096,7 @@ def _get_non_null_cols_and_dtypes(col_idxs, list_of_columns):
             # non-null Column with the same name is found.
             if idx not in dtypes:
                 dtypes[idx] = cols[idx].dtype
-            if cols[idx].valid_count > 0:
+            if cols[idx].null_count != len(cols[idx]):
                 if idx not in non_null_columns:
                     non_null_columns[idx] = [cols[idx]]
                 else:

diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
@@ -13,6 +13,7 @@
     Callable,
     Dict,
     List,
+    Literal,
     MutableMapping,
     Optional,
     Tuple,
@@ -882,7 +883,7 @@ def fillna(
                 replace_val = None
             should_fill = (
                 col_name in value
-                and col.contains_na_entries
+                and col.has_nulls(include_nan=True)
                 and not libcudf.scalar._is_null_host_scalar(replace_val)
             ) or method is not None
             if should_fill:
@@ -1354,7 +1355,11 @@ def notna(self):
 
     @_cudf_nvtx_annotate
     def searchsorted(
-        self, values, side="left", ascending=True, na_position="last"
+        self,
+        values,
+        side: Literal["left", "right"] = "left",
+        ascending: bool = True,
+        na_position: Literal["first", "last"] = "last",
     ):
         """Find indices where elements should be inserted to maintain order
 

diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2018-2023, NVIDIA CORPORATION.
+# Copyright (c) 2018-2024, NVIDIA CORPORATION.
 
 from __future__ import annotations
 
@@ -11,6 +11,7 @@
     Any,
     Dict,
     List,
+    Literal,
     MutableMapping,
     Optional,
     Sequence,
@@ -233,9 +234,9 @@ def _copy_type_metadata(
     def searchsorted(
         self,
         value: int,
-        side: str = "left",
+        side: Literal["left", "right"] = "left",
         ascending: bool = True,
-        na_position: str = "last",
+        na_position: Literal["first", "last"] = "last",
     ):
         assert (len(self) <= 1) or (
             ascending == (self._step > 0)
@@ -2205,9 +2206,9 @@ def copy(self, name=None, deep=False, dtype=None, names=None):
     def searchsorted(
         self,
         value,
-        side: str = "left",
+        side: Literal["left", "right"] = "left",
         ascending: bool = True,
-        na_position: str = "last",
+        na_position: Literal["first", "last"] = "last",
     ):
         value = self.dtype.type(value)
         return super().searchsorted(

diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
@@ -1710,7 +1710,7 @@ def _concat(cls, objs, axis=0, index=True):
     @_cudf_nvtx_annotate
     def valid_count(self):
         """Number of non-null values"""
-        return self._column.valid_count
+        return len(self) - self._column.null_count
 
     @property  # type: ignore
     @_cudf_nvtx_annotate

diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2018-2023, NVIDIA CORPORATION.
+# Copyright (c) 2018-2024, NVIDIA CORPORATION.
 
 import operator
 import string
@@ -217,7 +217,7 @@ def test_categorical_masking():
     got_masked = sr[got_matches]
 
     assert len(expect_masked) == len(got_masked)
-    assert len(expect_masked) == got_masked.valid_count
+    assert got_masked.null_count == 0
     assert_eq(got_masked, expect_masked)
 
 

diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2023, NVIDIA CORPORATION.
+# Copyright (c) 2019-2024, NVIDIA CORPORATION.
 
 import datetime
 import decimal
@@ -812,7 +812,7 @@ def test_orc_write_bool_statistics(tmpdir, datadir, nrows):
 
     if "number_of_values" in file_stats[0][col]:
         stats_valid_count = file_stats[0][col]["number_of_values"]
-        actual_valid_count = gdf[col].valid_count
+        actual_valid_count = len(gdf[col]) - gdf[col].null_count
         assert normalized_equals(actual_valid_count, stats_valid_count)
 
     # compare stripe statistics with actual min/max
@@ -827,7 +827,9 @@ def test_orc_write_bool_statistics(tmpdir, datadir, nrows):
             assert normalized_equals(actual_true_count, stats_true_count)
 
         if "number_of_values" in stripes_stats[stripe_idx][col]:
-            actual_valid_count = stripe_df[col].valid_count
+            actual_valid_count = (
+                len(stripe_df[col]) - stripe_df[col].null_count
+            )
             stats_valid_count = stripes_stats[stripe_idx][col][
                 "number_of_values"
             ]

diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2023, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 import datetime
 from collections import namedtuple
@@ -401,7 +401,7 @@ def min_column_type(x, expected_type):
 
     if not isinstance(x, cudf.core.column.NumericalColumn):
         raise TypeError("Argument x must be of type column.NumericalColumn")
-    if x.valid_count == 0:
+    if x.null_count == len(x):
         return x.dtype
 
     if np.issubdtype(x.dtype, np.floating):