Skip to content

Commit

Permalink
ENH: 2D support for MaskedArray (#38992)
Browse files Browse the repository at this point in the history
  • Loading branch information
jbrockmendel authored Oct 16, 2021
1 parent 0638f7f commit 4d9b6f7
Show file tree
Hide file tree
Showing 16 changed files with 374 additions and 93 deletions.
7 changes: 4 additions & 3 deletions pandas/_libs/algos.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -637,7 +637,7 @@ def pad_inplace(numeric_object_t[:] values, uint8_t[:] mask, limit=None):

@cython.boundscheck(False)
@cython.wraparound(False)
def pad_2d_inplace(numeric_object_t[:, :] values, const uint8_t[:, :] mask, limit=None):
def pad_2d_inplace(numeric_object_t[:, :] values, uint8_t[:, :] mask, limit=None):
cdef:
Py_ssize_t i, j, N, K
numeric_object_t val
Expand All @@ -656,10 +656,11 @@ def pad_2d_inplace(numeric_object_t[:, :] values, const uint8_t[:, :] mask, limi
val = values[j, 0]
for i in range(N):
if mask[j, i]:
if fill_count >= lim:
if fill_count >= lim or i == 0:
continue
fill_count += 1
values[j, i] = val
mask[j, i] = False
else:
fill_count = 0
val = values[j, i]
Expand Down Expand Up @@ -759,7 +760,7 @@ def backfill_inplace(numeric_object_t[:] values, uint8_t[:] mask, limit=None):


def backfill_2d_inplace(numeric_object_t[:, :] values,
                        uint8_t[:, :] mask,
                        limit=None):
    """
    Backward-fill masked entries of a 2D array in place.

    Delegates to ``pad_2d_inplace`` on views of ``values`` and ``mask`` that
    are reversed along the second axis; ``limit`` is forwarded unchanged.
    """
    # ``mask`` has to be a writable (non-const) memoryview: pad_2d_inplace
    # clears the mask entries it fills, and those writes land in the caller's
    # mask through the reversed view.
    pad_2d_inplace(values[:, ::-1], mask[:, ::-1], limit)

Expand Down
65 changes: 51 additions & 14 deletions pandas/core/array_algos/masked_reductions.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,10 @@
for missing values.
"""

from typing import Callable
from typing import (
Callable,
Optional,
)

import numpy as np

Expand All @@ -19,6 +22,7 @@ def _sumprod(
*,
skipna: bool = True,
min_count: int = 0,
axis: Optional[int] = None,
):
"""
Sum or product for 1D masked array.
Expand All @@ -36,36 +40,55 @@ def _sumprod(
min_count : int, default 0
The required number of valid values to perform the operation. If fewer than
``min_count`` non-NA values are present the result will be NA.
axis : int, optional, default None
"""
if not skipna:
if mask.any() or check_below_min_count(values.shape, None, min_count):
if mask.any(axis=axis) or check_below_min_count(values.shape, None, min_count):
return libmissing.NA
else:
return func(values)
return func(values, axis=axis)
else:
if check_below_min_count(values.shape, mask, min_count):
if check_below_min_count(values.shape, mask, min_count) and (
axis is None or values.ndim == 1
):
return libmissing.NA
return func(values, where=~mask)

return func(values, where=~mask, axis=axis)


def sum(
    values: np.ndarray,
    mask: np.ndarray,
    *,
    skipna: bool = True,
    min_count: int = 0,
    axis: Optional[int] = None,
):
    """
    Sum of a masked array, computed by ``_sumprod`` with ``np.sum``.

    Parameters
    ----------
    values : np.ndarray
        Numpy array with the values.
    mask : np.ndarray
        Boolean numpy array (True values indicate missing values).
    skipna : bool, default True
        Whether to skip NA.
    min_count : int, default 0
        The required number of valid values to perform the operation. If fewer
        than ``min_count`` non-NA values are present the result will be NA.
    axis : int, optional, default None
        Axis handed on to the underlying reduction.
    """
    return _sumprod(
        np.sum, values=values, mask=mask, skipna=skipna, min_count=min_count, axis=axis
    )


def prod(
    values: np.ndarray,
    mask: np.ndarray,
    *,
    skipna: bool = True,
    min_count: int = 0,
    axis: Optional[int] = None,
):
    """
    Product of a masked array, computed by ``_sumprod`` with ``np.prod``.

    Parameters
    ----------
    values : np.ndarray
        Numpy array with the values.
    mask : np.ndarray
        Boolean numpy array (True values indicate missing values).
    skipna : bool, default True
        Whether to skip NA.
    min_count : int, default 0
        The required number of valid values to perform the operation. If fewer
        than ``min_count`` non-NA values are present the result will be NA.
    axis : int, optional, default None
        Axis handed on to the underlying reduction.
    """
    return _sumprod(
        np.prod, values=values, mask=mask, skipna=skipna, min_count=min_count, axis=axis
    )


def _minmax(
func: Callable, values: np.ndarray, mask: np.ndarray, *, skipna: bool = True
func: Callable,
values: np.ndarray,
mask: np.ndarray,
*,
skipna: bool = True,
axis: Optional[int] = None,
):
"""
Reduction for 1D masked array.
Expand All @@ -80,6 +103,7 @@ def _minmax(
Boolean numpy array (True values indicate missing values).
skipna : bool, default True
Whether to skip NA.
axis : int, optional, default None
"""
if not skipna:
if mask.any() or not values.size:
Expand All @@ -96,14 +120,27 @@ def _minmax(
return libmissing.NA


def min(
    values: np.ndarray,
    mask: np.ndarray,
    *,
    skipna: bool = True,
    axis: Optional[int] = None,
):
    """
    Minimum of a masked array, computed by ``_minmax`` with ``np.min``.

    Parameters
    ----------
    values : np.ndarray
        Numpy array with the values.
    mask : np.ndarray
        Boolean numpy array (True values indicate missing values).
    skipna : bool, default True
        Whether to skip NA.
    axis : int, optional, default None
        Axis handed on to ``_minmax``.
    """
    return _minmax(np.min, values=values, mask=mask, skipna=skipna, axis=axis)


def max(
    values: np.ndarray,
    mask: np.ndarray,
    *,
    skipna: bool = True,
    axis: Optional[int] = None,
):
    """
    Maximum of a masked array, computed by ``_minmax`` with ``np.max``.

    Parameters
    ----------
    values : np.ndarray
        Numpy array with the values.
    mask : np.ndarray
        Boolean numpy array (True values indicate missing values).
    skipna : bool, default True
        Whether to skip NA.
    axis : int, optional, default None
        Axis handed on to ``_minmax``.
    """
    return _minmax(np.max, values=values, mask=mask, skipna=skipna, axis=axis)


# TODO: axis kwarg
def mean(values: np.ndarray, mask: np.ndarray, skipna: bool = True):
if not values.size or mask.all():
return libmissing.NA
Expand Down
21 changes: 0 additions & 21 deletions pandas/core/arrays/_mixins.py
Original file line number Diff line number Diff line change
Expand Up @@ -298,27 +298,6 @@ def _wrap_reduction_result(self, axis: int | None, result):
return self._box_func(result)
return self._from_backing_data(result)

# ------------------------------------------------------------------------

def __repr__(self) -> str:
if self.ndim == 1:
return super().__repr__()

from pandas.io.formats.printing import format_object_summary

# the short repr has no trailing newline, while the truncated
# repr does. So we include a newline in our template, and strip
# any trailing newlines from format_object_summary
lines = [
format_object_summary(x, self._formatter(), indent_for_name=False).rstrip(
", \n"
)
for x in self
]
data = ",\n".join(lines)
class_name = f"<{type(self).__name__}>"
return f"{class_name}\n[\n{data}\n]\nShape: {self.shape}, dtype: {self.dtype}"

# ------------------------------------------------------------------------
# __array_function__ methods

Expand Down
19 changes: 19 additions & 0 deletions pandas/core/arrays/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -1209,6 +1209,9 @@ def view(self, dtype: Dtype | None = None) -> ArrayLike:
# ------------------------------------------------------------------------

def __repr__(self) -> str:
if self.ndim > 1:
return self._repr_2d()

from pandas.io.formats.printing import format_object_summary

# the short repr has no trailing newline, while the truncated
Expand All @@ -1220,6 +1223,22 @@ def __repr__(self) -> str:
class_name = f"<{type(self).__name__}>\n"
return f"{class_name}{data}\nLength: {len(self)}, dtype: {self.dtype}"

def _repr_2d(self) -> str:
from pandas.io.formats.printing import format_object_summary

# the short repr has no trailing newline, while the truncated
# repr does. So we include a newline in our template, and strip
# any trailing newlines from format_object_summary
lines = [
format_object_summary(x, self._formatter(), indent_for_name=False).rstrip(
", \n"
)
for x in self
]
data = ",\n".join(lines)
class_name = f"<{type(self).__name__}>"
return f"{class_name}\n[\n{data}\n]\nShape: {self.shape}, dtype: {self.dtype}"

def _formatter(self, boxed: bool = False) -> Callable[[Any], str | None]:
"""
Formatting function for scalar values.
Expand Down
145 changes: 141 additions & 4 deletions pandas/core/arrays/boolean.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
npt,
type_t,
)
from pandas.compat.numpy import function as nv

from pandas.core.dtypes.common import (
is_bool_dtype,
Expand Down Expand Up @@ -245,10 +246,8 @@ def coerce_to_array(
if mask_values is not None:
mask = mask | mask_values

if values.ndim != 1:
raise ValueError("values must be a 1D list-like")
if mask.ndim != 1:
raise ValueError("mask must be a 1D list-like")
if values.shape != mask.shape:
raise ValueError("values.shape and mask.shape must match")

return values, mask

Expand Down Expand Up @@ -447,6 +446,144 @@ def _values_for_argsort(self) -> np.ndarray:
data[self._mask] = -1
return data

def any(self, *, skipna: bool = True, axis: int | None = 0, **kwargs):
    """
    Return whether any element is True.

    The result is False unless at least one element is True. NAs are
    skipped by default. With ``skipna=False`` and missing values present,
    :ref:`Kleene logic <boolean.kleene>` is applied, as for the logical
    operations.

    Parameters
    ----------
    skipna : bool, default True
        Exclude NA values. An all-NA array gives False when `skipna` is
        True, exactly like an empty array. When `skipna` is False the
        result is still True if some element is True; otherwise NA is
        returned if any NA is present.
    axis : int or None, default 0
        Axis passed to the underlying numpy reduction.
    **kwargs : any, default None
        Additional keywords have no effect but might be accepted for
        compatibility with NumPy.

    Returns
    -------
    bool or :attr:`pandas.NA`

    See Also
    --------
    numpy.any : Numpy version of this method.
    BooleanArray.all : Return whether all elements are True.

    Examples
    --------
    By default NAs are skipped:

    >>> pd.array([True, False, True]).any()
    True
    >>> pd.array([True, False, pd.NA]).any()
    True
    >>> pd.array([False, False, pd.NA]).any()
    False
    >>> pd.array([], dtype="boolean").any()
    False
    >>> pd.array([pd.NA], dtype="boolean").any()
    False

    With ``skipna=False`` the result is NA whenever the truth value of the
    missing entries would decide the outcome:

    >>> pd.array([True, False, pd.NA]).any(skipna=False)
    True
    >>> pd.array([False, False, pd.NA]).any(skipna=False)
    <NA>
    """
    kwargs.pop("axis", None)
    nv.validate_any((), kwargs)

    # Reduce over a copy in which every missing slot reads False, so an NA
    # can never be what makes the reduction come out True.
    filled = self._data.copy()
    np.putmask(filled, self._mask, False)
    reduced = filled.any(axis=axis)

    if skipna:
        return reduced

    # Kleene logic: a True answer stands on its own; a False answer is only
    # trustworthy when the array is empty or nothing is missing.
    # NOTE(review): if ``reduced`` is an array with more than one element
    # (multi-dimensional data with a non-None axis) this truth test is
    # ambiguous and numpy raises -- confirm the intended handling.
    undetermined = not reduced and self.size != 0 and self._mask.any()
    return self.dtype.na_value if undetermined else reduced

def all(self, *, skipna: bool = True, axis: int | None = 0, **kwargs):
    """
    Return whether all elements are True.

    The result is True unless at least one element is False. NAs are
    skipped by default. With ``skipna=False`` and missing values present,
    :ref:`Kleene logic <boolean.kleene>` is applied, as for the logical
    operations.

    Parameters
    ----------
    skipna : bool, default True
        Exclude NA values. An all-NA array gives True when `skipna` is
        True, exactly like an empty array. When `skipna` is False the
        result is still False if some element is False; otherwise NA is
        returned if any NA is present.
    axis : int or None, default 0
        Axis passed to the underlying numpy reduction.
    **kwargs : any, default None
        Additional keywords have no effect but might be accepted for
        compatibility with NumPy.

    Returns
    -------
    bool or :attr:`pandas.NA`

    See Also
    --------
    numpy.all : Numpy version of this method.
    BooleanArray.any : Return whether any element is True.

    Examples
    --------
    The result indicates whether all elements are True (NAs are skipped by
    default):

    >>> pd.array([True, True, pd.NA]).all()
    True
    >>> pd.array([True, False, pd.NA]).all()
    False
    >>> pd.array([], dtype="boolean").all()
    True
    >>> pd.array([pd.NA], dtype="boolean").all()
    True

    With ``skipna=False`` the result is NA whenever the truth value of the
    missing entries would decide the outcome:

    >>> pd.array([True, True, pd.NA]).all(skipna=False)
    <NA>
    >>> pd.array([True, False, pd.NA]).all(skipna=False)
    False
    """
    kwargs.pop("axis", None)
    nv.validate_all((), kwargs)

    # Reduce over a copy in which every missing slot reads True, so an NA
    # can never be what makes the reduction come out False.
    filled = self._data.copy()
    np.putmask(filled, self._mask, True)
    reduced = filled.all(axis=axis)

    if skipna:
        return reduced

    # Kleene logic: a False answer stands on its own; a True answer is only
    # trustworthy when the array is empty or nothing is missing.
    # NOTE(review): if ``reduced`` is an array with more than one element
    # (multi-dimensional data with a non-None axis) this truth test is
    # ambiguous and numpy raises -- confirm the intended handling.
    undetermined = reduced and self.size != 0 and self._mask.any()
    return self.dtype.na_value if undetermined else reduced

def _logical_method(self, other, op):

assert op.__name__ in {"or_", "ror_", "and_", "rand_", "xor", "rxor"}
Expand Down
Loading

0 comments on commit 4d9b6f7

Please sign in to comment.