ENH: 2D support for MaskedArray (#38992)

pandas-dev · Oct 16, 2021 · 4d9b6f7 · 4d9b6f7
1 parent 0638f7f
commit 4d9b6f7
Show file tree

Hide file tree

Showing 16 changed files with 374 additions and 93 deletions.
diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx
@@ -637,7 +637,7 @@ def pad_inplace(numeric_object_t[:] values, uint8_t[:] mask, limit=None):
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def pad_2d_inplace(numeric_object_t[:, :] values, const uint8_t[:, :] mask, limit=None):
+def pad_2d_inplace(numeric_object_t[:, :] values, uint8_t[:, :] mask, limit=None):
     cdef:
         Py_ssize_t i, j, N, K
         numeric_object_t val
@@ -656,10 +656,11 @@ def pad_2d_inplace(numeric_object_t[:, :] values, const uint8_t[:, :] mask, limi
         val = values[j, 0]
         for i in range(N):
             if mask[j, i]:
-                if fill_count >= lim:
+                if fill_count >= lim or i == 0:
                     continue
                 fill_count += 1
                 values[j, i] = val
+                mask[j, i] = False
             else:
                 fill_count = 0
                 val = values[j, i]
@@ -759,7 +760,7 @@ def backfill_inplace(numeric_object_t[:] values, uint8_t[:] mask, limit=None):
 
 
 def backfill_2d_inplace(numeric_object_t[:, :] values,
-                        const uint8_t[:, :] mask,
+                        uint8_t[:, :] mask,
                         limit=None):
     pad_2d_inplace(values[:, ::-1], mask[:, ::-1], limit)
 

diff --git a/pandas/core/array_algos/masked_reductions.py b/pandas/core/array_algos/masked_reductions.py
@@ -3,7 +3,10 @@
 for missing values.
 """
 
-from typing import Callable
+from typing import (
+    Callable,
+    Optional,
+)
 
 import numpy as np
 
@@ -19,6 +22,7 @@ def _sumprod(
     *,
     skipna: bool = True,
     min_count: int = 0,
+    axis: Optional[int] = None,
 ):
     """
     Sum or product for 1D masked array.
@@ -36,36 +40,55 @@ def _sumprod(
     min_count : int, default 0
         The required number of valid values to perform the operation. If fewer than
         ``min_count`` non-NA values are present the result will be NA.
+    axis : int, optional, default None
     """
     if not skipna:
-        if mask.any() or check_below_min_count(values.shape, None, min_count):
+        if mask.any(axis=axis) or check_below_min_count(values.shape, None, min_count):
             return libmissing.NA
         else:
-            return func(values)
+            return func(values, axis=axis)
     else:
-        if check_below_min_count(values.shape, mask, min_count):
+        if check_below_min_count(values.shape, mask, min_count) and (
+            axis is None or values.ndim == 1
+        ):
             return libmissing.NA
-        return func(values, where=~mask)
+
+        return func(values, where=~mask, axis=axis)
 
 
 def sum(
-    values: np.ndarray, mask: np.ndarray, *, skipna: bool = True, min_count: int = 0
+    values: np.ndarray,
+    mask: np.ndarray,
+    *,
+    skipna: bool = True,
+    min_count: int = 0,
+    axis: Optional[int] = None,
 ):
     return _sumprod(
-        np.sum, values=values, mask=mask, skipna=skipna, min_count=min_count
+        np.sum, values=values, mask=mask, skipna=skipna, min_count=min_count, axis=axis
     )
 
 
 def prod(
-    values: np.ndarray, mask: np.ndarray, *, skipna: bool = True, min_count: int = 0
+    values: np.ndarray,
+    mask: np.ndarray,
+    *,
+    skipna: bool = True,
+    min_count: int = 0,
+    axis: Optional[int] = None,
 ):
     return _sumprod(
-        np.prod, values=values, mask=mask, skipna=skipna, min_count=min_count
+        np.prod, values=values, mask=mask, skipna=skipna, min_count=min_count, axis=axis
     )
 
 
 def _minmax(
-    func: Callable, values: np.ndarray, mask: np.ndarray, *, skipna: bool = True
+    func: Callable,
+    values: np.ndarray,
+    mask: np.ndarray,
+    *,
+    skipna: bool = True,
+    axis: Optional[int] = None,
 ):
     """
     Reduction for 1D masked array.
@@ -80,6 +103,7 @@ def _minmax(
         Boolean numpy array (True values indicate missing values).
     skipna : bool, default True
         Whether to skip NA.
+    axis : int, optional, default None
     """
     if not skipna:
         if mask.any() or not values.size:
@@ -96,14 +120,27 @@ def _minmax(
             return libmissing.NA
 
 
-def min(values: np.ndarray, mask: np.ndarray, *, skipna: bool = True):
-    return _minmax(np.min, values=values, mask=mask, skipna=skipna)
+def min(
+    values: np.ndarray,
+    mask: np.ndarray,
+    *,
+    skipna: bool = True,
+    axis: Optional[int] = None,
+):
+    return _minmax(np.min, values=values, mask=mask, skipna=skipna, axis=axis)
 
 
-def max(values: np.ndarray, mask: np.ndarray, *, skipna: bool = True):
-    return _minmax(np.max, values=values, mask=mask, skipna=skipna)
+def max(
+    values: np.ndarray,
+    mask: np.ndarray,
+    *,
+    skipna: bool = True,
+    axis: Optional[int] = None,
+):
+    return _minmax(np.max, values=values, mask=mask, skipna=skipna, axis=axis)
 
 
+# TODO: axis kwarg
 def mean(values: np.ndarray, mask: np.ndarray, skipna: bool = True):
     if not values.size or mask.all():
         return libmissing.NA

diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py
@@ -298,27 +298,6 @@ def _wrap_reduction_result(self, axis: int | None, result):
             return self._box_func(result)
         return self._from_backing_data(result)
 
-    # ------------------------------------------------------------------------
-
-    def __repr__(self) -> str:
-        if self.ndim == 1:
-            return super().__repr__()
-
-        from pandas.io.formats.printing import format_object_summary
-
-        # the short repr has no trailing newline, while the truncated
-        # repr does. So we include a newline in our template, and strip
-        # any trailing newlines from format_object_summary
-        lines = [
-            format_object_summary(x, self._formatter(), indent_for_name=False).rstrip(
-                ", \n"
-            )
-            for x in self
-        ]
-        data = ",\n".join(lines)
-        class_name = f"<{type(self).__name__}>"
-        return f"{class_name}\n[\n{data}\n]\nShape: {self.shape}, dtype: {self.dtype}"
-
     # ------------------------------------------------------------------------
     # __array_function__ methods
 

diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
@@ -1209,6 +1209,9 @@ def view(self, dtype: Dtype | None = None) -> ArrayLike:
     # ------------------------------------------------------------------------
 
     def __repr__(self) -> str:
+        if self.ndim > 1:
+            return self._repr_2d()
+
         from pandas.io.formats.printing import format_object_summary
 
         # the short repr has no trailing newline, while the truncated
@@ -1220,6 +1223,22 @@ def __repr__(self) -> str:
         class_name = f"<{type(self).__name__}>\n"
         return f"{class_name}{data}\nLength: {len(self)}, dtype: {self.dtype}"
 
+    def _repr_2d(self) -> str:
+        from pandas.io.formats.printing import format_object_summary
+
+        # the short repr has no trailing newline, while the truncated
+        # repr does. So we include a newline in our template, and strip
+        # any trailing newlines from format_object_summary
+        lines = [
+            format_object_summary(x, self._formatter(), indent_for_name=False).rstrip(
+                ", \n"
+            )
+            for x in self
+        ]
+        data = ",\n".join(lines)
+        class_name = f"<{type(self).__name__}>"
+        return f"{class_name}\n[\n{data}\n]\nShape: {self.shape}, dtype: {self.dtype}"
+
     def _formatter(self, boxed: bool = False) -> Callable[[Any], str | None]:
         """
         Formatting function for scalar values.

diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py
@@ -21,6 +21,7 @@
     npt,
     type_t,
 )
+from pandas.compat.numpy import function as nv
 
 from pandas.core.dtypes.common import (
     is_bool_dtype,
@@ -245,10 +246,8 @@ def coerce_to_array(
             if mask_values is not None:
                 mask = mask | mask_values
 
-    if values.ndim != 1:
-        raise ValueError("values must be a 1D list-like")
-    if mask.ndim != 1:
-        raise ValueError("mask must be a 1D list-like")
+    if values.shape != mask.shape:
+        raise ValueError("values.shape and mask.shape must match")
 
     return values, mask
 
@@ -447,6 +446,144 @@ def _values_for_argsort(self) -> np.ndarray:
         data[self._mask] = -1
         return data
 
+    def any(self, *, skipna: bool = True, axis: int | None = 0, **kwargs):
+        """
+        Return whether any element is True.
+
+        Returns False unless there is at least one element that is True.
+        By default, NAs are skipped. If ``skipna=False`` is specified and
+        missing values are present, similar :ref:`Kleene logic <boolean.kleene>`
+        is used as for logical operations.
+
+        Parameters
+        ----------
+        skipna : bool, default True
+            Exclude NA values. If the entire array is NA and `skipna` is
+            True, then the result will be False, as for an empty array.
+            If `skipna` is False, the result will still be True if there is
+            at least one element that is True, otherwise NA will be returned
+            if there are NA's present.
+        axis : int or None, default 0
+        **kwargs : any, default None
+            Additional keywords have no effect but might be accepted for
+            compatibility with NumPy.
+
+        Returns
+        -------
+        bool or :attr:`pandas.NA`
+
+        See Also
+        --------
+        numpy.any : Numpy version of this method.
+        BooleanArray.all : Return whether all elements are True.
+
+        Examples
+        --------
+        The result indicates whether any element is True (and by default
+        skips NAs):
+
+        >>> pd.array([True, False, True]).any()
+        True
+        >>> pd.array([True, False, pd.NA]).any()
+        True
+        >>> pd.array([False, False, pd.NA]).any()
+        False
+        >>> pd.array([], dtype="boolean").any()
+        False
+        >>> pd.array([pd.NA], dtype="boolean").any()
+        False
+
+        With ``skipna=False``, the result can be NA if this is logically
+        required (whether ``pd.NA`` is True or False influences the result):
+
+        >>> pd.array([True, False, pd.NA]).any(skipna=False)
+        True
+        >>> pd.array([False, False, pd.NA]).any(skipna=False)
+        <NA>
+        """
+        kwargs.pop("axis", None)
+        nv.validate_any((), kwargs)
+
+        values = self._data.copy()
+        np.putmask(values, self._mask, False)
+        result = values.any(axis=axis)
+
+        if skipna:
+            return result
+        else:
+            if result or self.size == 0 or not self._mask.any():
+                return result
+            else:
+                return self.dtype.na_value
+
+    def all(self, *, skipna: bool = True, axis: int | None = 0, **kwargs):
+        """
+        Return whether all elements are True.
+
+        Returns True unless there is at least one element that is False.
+        By default, NAs are skipped. If ``skipna=False`` is specified and
+        missing values are present, similar :ref:`Kleene logic <boolean.kleene>`
+        is used as for logical operations.
+
+        Parameters
+        ----------
+        skipna : bool, default True
+            Exclude NA values. If the entire array is NA and `skipna` is
+            True, then the result will be True, as for an empty array.
+            If `skipna` is False, the result will still be False if there is
+            at least one element that is False, otherwise NA will be returned
+            if there are NA's present.
+        axis : int or None, default 0
+        **kwargs : any, default None
+            Additional keywords have no effect but might be accepted for
+            compatibility with NumPy.
+
+        Returns
+        -------
+        bool or :attr:`pandas.NA`
+
+        See Also
+        --------
+        numpy.all : Numpy version of this method.
+        BooleanArray.any : Return whether any element is True.
+
+        Examples
+        --------
+        The result indicates whether all elements are True (and by default
+        skips NAs):
+
+        >>> pd.array([True, True, pd.NA]).all()
+        True
+        >>> pd.array([True, False, pd.NA]).all()
+        False
+        >>> pd.array([], dtype="boolean").all()
+        True
+        >>> pd.array([pd.NA], dtype="boolean").all()
+        True
+
+        With ``skipna=False``, the result can be NA if this is logically
+        required (whether ``pd.NA`` is True or False influences the result):
+
+        >>> pd.array([True, True, pd.NA]).all(skipna=False)
+        <NA>
+        >>> pd.array([True, False, pd.NA]).all(skipna=False)
+        False
+        """
+        kwargs.pop("axis", None)
+        nv.validate_all((), kwargs)
+
+        values = self._data.copy()
+        np.putmask(values, self._mask, True)
+        result = values.all(axis=axis)
+
+        if skipna:
+            return result
+        else:
+            if not result or self.size == 0 or not self._mask.any():
+                return result
+            else:
+                return self.dtype.na_value
+
     def _logical_method(self, other, op):
 
         assert op.__name__ in {"or_", "ror_", "and_", "rand_", "xor", "rxor"}