Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: 2D support for MaskedArray #38992

Merged
merged 57 commits into from
Oct 16, 2021
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
Show all changes
57 commits
Select commit Hold shift + click to select a range
ae51cff
ENH: 2D support for MaskedArray
jbrockmendel Jan 6, 2021
f608792
Merge branch 'master' of https://github.com/pandas-dev/pandas into en…
jbrockmendel Jan 6, 2021
125606b
remove Any part of _mask annotation
jbrockmendel Jan 6, 2021
dd5dbbe
xfail for ArrowStringArray
jbrockmendel Jan 6, 2021
577826c
Merge branch 'master' of https://github.com/pandas-dev/pandas into en…
jbrockmendel Feb 3, 2021
17f63d4
absolute import
jbrockmendel Feb 3, 2021
33b2d78
Merge branch 'master' of https://github.com/pandas-dev/pandas into en…
jbrockmendel Feb 4, 2021
a2bd7b1
Merge branch 'master' of https://github.com/pandas-dev/pandas into en…
jbrockmendel Feb 5, 2021
3f14fa3
TST: reductions with axis
jbrockmendel Feb 5, 2021
6600588
Merge branch 'master' of https://github.com/pandas-dev/pandas into en…
jbrockmendel Feb 6, 2021
560279c
Merge branch 'master' into enh-masked-2d
jbrockmendel Feb 7, 2021
553038c
np_version_under1p17 compat
jbrockmendel Feb 7, 2021
b2a26bf
Merge branch 'master' into enh-masked-2d
jbrockmendel Feb 12, 2021
8a40d59
Merge branch 'master' into enh-masked-2d
jbrockmendel Feb 12, 2021
44999d1
xfail syntax
jbrockmendel Feb 12, 2021
7a6c226
typo fixup
jbrockmendel Feb 12, 2021
638bd9c
Merge branch 'master' into enh-masked-2d
jbrockmendel Feb 13, 2021
aca12e6
Merge branch 'master' into enh-masked-2d
jbrockmendel Feb 14, 2021
f27f8c0
Merge branch 'master' into enh-masked-2d
jbrockmendel Feb 15, 2021
3810660
Merge branch 'master' into enh-masked-2d
jbrockmendel Feb 19, 2021
f0957b3
Merge branch 'master' into enh-masked-2d
jbrockmendel Feb 22, 2021
f538868
Merge branch 'master' into enh-masked-2d
jbrockmendel Feb 23, 2021
6664d0d
isort fixup
jbrockmendel Feb 23, 2021
2792724
Merge branch 'master' into enh-masked-2d
jbrockmendel Feb 26, 2021
061a53c
Merge branch 'master' into enh-masked-2d
jbrockmendel Mar 2, 2021
6032ed1
Merge branch 'master' into enh-masked-2d
jbrockmendel Mar 3, 2021
543258d
Merge branch 'master' into enh-masked-2d
jbrockmendel Mar 6, 2021
e96ec33
Merge branch 'master' into enh-masked-2d
jbrockmendel Mar 9, 2021
6ca7f01
Merge branch 'master' into enh-masked-2d
jbrockmendel Mar 10, 2021
6f26c4b
Fix pad/backfill 2d
jbrockmendel Mar 12, 2021
3dedb8f
Merge branch 'master' into enh-masked-2d
jbrockmendel Mar 12, 2021
34fda97
typo fixup
jbrockmendel Mar 12, 2021
2a108ba
Merge branch 'master' into enh-masked-2d
jbrockmendel Mar 15, 2021
2c99e59
Merge branch 'master' into enh-masked-2d
jbrockmendel Mar 16, 2021
ee9c3a0
Merge branch 'master' into enh-masked-2d
jbrockmendel Mar 31, 2021
efd0071
Merge branch 'master' into enh-masked-2d
jbrockmendel Mar 31, 2021
3839b98
Merge branch 'master' into enh-masked-2d
jbrockmendel Apr 8, 2021
0956993
Merge branch 'master' into enh-masked-2d
jbrockmendel Apr 14, 2021
4b75101
comment
jbrockmendel Apr 14, 2021
639fc23
Merge branch 'master' into enh-masked-2d
jbrockmendel May 4, 2021
2989efc
Merge branch 'master' into enh-masked-2d
jbrockmendel May 9, 2021
81cd3e4
Merge branch 'master' into enh-masked-2d
jbrockmendel May 17, 2021
8f315bc
Merge branch 'master' into enh-masked-2d
jbrockmendel Aug 3, 2021
21cf578
fix broken tests
jbrockmendel Aug 4, 2021
6f215d6
Merge branch 'master' into enh-masked-2d
jbrockmendel Sep 28, 2021
e68c797
Merge branch 'master' into enh-masked-2d
jbrockmendel Sep 29, 2021
17dd19a
Merge branch 'master' into enh-masked-2d
jbrockmendel Oct 3, 2021
93d65eb
Merge branch 'master' into enh-masked-2d
jbrockmendel Oct 6, 2021
3bfe60c
comment
jbrockmendel Oct 6, 2021
5c28d69
Merge branch 'master' into enh-masked-2d
jbrockmendel Oct 8, 2021
92d710b
Merge branch 'master' of https://github.com/pandas-dev/pandas into en…
jbrockmendel Oct 10, 2021
5b014c1
troubleshoot windows build
jbrockmendel Oct 11, 2021
db76ca0
Merge branch 'master' into enh-masked-2d
jbrockmendel Oct 11, 2021
8148fcd
Merge branch 'master' into enh-masked-2d
jbrockmendel Oct 13, 2021
15a533f
troubleshoot 32bit builds
jbrockmendel Oct 13, 2021
7a7601e
troubleshoot 32bit builds
jbrockmendel Oct 14, 2021
7c6baaf
troubleshoot 32 bit builds
jbrockmendel Oct 14, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 48 additions & 15 deletions pandas/core/array_algos/masked_reductions.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
for missing values.
"""

from typing import Callable
from typing import Callable, Optional

import numpy as np

Expand All @@ -20,6 +20,7 @@ def _sumprod(
*,
skipna: bool = True,
min_count: int = 0,
axis: Optional[int] = None,
):
"""
Sum or product for a masked array, optionally reduced along ``axis``.
Expand All @@ -37,40 +38,58 @@ def _sumprod(
min_count : int, default 0
The required number of valid values to perform the operation. If fewer than
``min_count`` non-NA values are present the result will be NA.
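axis : int, optional, default None
Axis forwarded to ``func``; callers in this module pass ``np.sum`` or
``np.prod``, for which ``None`` reduces over all elements.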
"""
if not skipna:
if mask.any() or check_below_min_count(values.shape, None, min_count):
if mask.any(axis=axis) or check_below_min_count(values.shape, None, min_count):
return libmissing.NA
else:
return func(values)
return func(values, axis=axis)
else:
if check_below_min_count(values.shape, mask, min_count):
if check_below_min_count(values.shape, mask, min_count) and (
axis is None or values.ndim == 1
):
return libmissing.NA

if np_version_under1p17:
return func(values[~mask])
return func(values[~mask], axis=axis)
else:
return func(values, where=~mask)
return func(values, where=~mask, axis=axis)


def sum(
values: np.ndarray, mask: np.ndarray, *, skipna: bool = True, min_count: int = 0
values: np.ndarray,
mask: np.ndarray,
*,
skipna: bool = True,
min_count: int = 0,
axis: Optional[int] = None,
):
return _sumprod(
np.sum, values=values, mask=mask, skipna=skipna, min_count=min_count
np.sum, values=values, mask=mask, skipna=skipna, min_count=min_count, axis=axis
)


def prod(
values: np.ndarray, mask: np.ndarray, *, skipna: bool = True, min_count: int = 0
values: np.ndarray,
mask: np.ndarray,
*,
skipna: bool = True,
min_count: int = 0,
axis: Optional[int] = None,
):
return _sumprod(
np.prod, values=values, mask=mask, skipna=skipna, min_count=min_count
np.prod, values=values, mask=mask, skipna=skipna, min_count=min_count, axis=axis
)


def _minmax(
func: Callable, values: np.ndarray, mask: np.ndarray, *, skipna: bool = True
func: Callable,
values: np.ndarray,
mask: np.ndarray,
*,
skipna: bool = True,
axis: Optional[int] = None,
):
"""
Reduction for 1D masked array.
Expand All @@ -85,6 +104,7 @@ def _minmax(
Boolean numpy array (True values indicate missing values).
skipna : bool, default True
Whether to skip NA.
axis : int, optional, default None
"""
if not skipna:
if mask.any() or not values.size:
Expand All @@ -101,14 +121,27 @@ def _minmax(
return libmissing.NA


def min(values: np.ndarray, mask: np.ndarray, *, skipna: bool = True):
return _minmax(np.min, values=values, mask=mask, skipna=skipna)
def min(
values: np.ndarray,
mask: np.ndarray,
*,
skipna: bool = True,
axis: Optional[int] = None,
):
return _minmax(np.min, values=values, mask=mask, skipna=skipna, axis=axis)


def max(values: np.ndarray, mask: np.ndarray, *, skipna: bool = True):
return _minmax(np.max, values=values, mask=mask, skipna=skipna)
def max(
values: np.ndarray,
mask: np.ndarray,
*,
skipna: bool = True,
axis: Optional[int] = None,
):
return _minmax(np.max, values=values, mask=mask, skipna=skipna, axis=axis)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

are there doc strings here? if so can you update (can be followup as well)



# TODO: axis kwarg
def mean(values: np.ndarray, mask: np.ndarray, skipna: bool = True):
if not values.size or mask.all():
return libmissing.NA
Expand Down
21 changes: 0 additions & 21 deletions pandas/core/arrays/_mixins.py
Original file line number Diff line number Diff line change
Expand Up @@ -301,27 +301,6 @@ def _wrap_reduction_result(self, axis: Optional[int], result):
return self._box_func(result)
return self._from_backing_data(result)

# ------------------------------------------------------------------------
jreback marked this conversation as resolved.
Show resolved Hide resolved

def __repr__(self) -> str:
# 1-D arrays defer to the parent class repr; only arrays whose ndim is not 1
# get the multi-line layout built below.
if self.ndim == 1:
return super().__repr__()

# imported inside the method rather than at module level
# NOTE(review): presumably to avoid a circular import — confirm
from pandas.io.formats.printing import format_object_summary

# the short repr has no trailing newline, while the truncated
# repr does. So we include a newline in our template, and strip
# any trailing newlines from format_object_summary
# One summary string is produced per element yielded by iterating ``self``;
# rstrip(", \n") removes any trailing run of commas, spaces and newlines.
lines = [
format_object_summary(x, self._formatter(), indent_for_name=False).rstrip(
", \n"
)
for x in self
]
# Element summaries are separated by a comma followed by a newline.
data = ",\n".join(lines)
class_name = f"<{type(self).__name__}>"
# Layout: "<ClassName>", "[", the joined summaries, "]", then a footer line
# reporting the shape and dtype.
return f"{class_name}\n[\n{data}\n]\nShape: {self.shape}, dtype: {self.dtype}"

# ------------------------------------------------------------------------
# __array_function__ methods

Expand Down
19 changes: 19 additions & 0 deletions pandas/core/arrays/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -1097,6 +1097,9 @@ def view(self, dtype: Optional[Dtype] = None) -> ArrayLike:
# ------------------------------------------------------------------------

def __repr__(self) -> str:
if self.ndim > 1:
return self._repr_2d()

from pandas.io.formats.printing import format_object_summary

# the short repr has no trailing newline, while the truncated
Expand All @@ -1108,6 +1111,22 @@ def __repr__(self) -> str:
class_name = f"<{type(self).__name__}>\n"
return f"{class_name}{data}\nLength: {len(self)}, dtype: {self.dtype}"

def _repr_2d(self) -> str:
# Builds the repr string that __repr__ returns when self.ndim > 1.
# imported inside the method rather than at module level
# NOTE(review): presumably to avoid a circular import — confirm
from pandas.io.formats.printing import format_object_summary

# the short repr has no trailing newline, while the truncated
# repr does. So we include a newline in our template, and strip
# any trailing newlines from format_object_summary
# One summary string is produced per element yielded by iterating ``self``;
# rstrip(", \n") removes any trailing run of commas, spaces and newlines.
lines = [
format_object_summary(x, self._formatter(), indent_for_name=False).rstrip(
", \n"
)
for x in self
]
# Element summaries are separated by a comma followed by a newline.
data = ",\n".join(lines)
class_name = f"<{type(self).__name__}>"
# Layout: "<ClassName>", "[", the joined summaries, "]", then a footer line
# reporting the shape and dtype (the 1-D repr reports "Length" instead).
return f"{class_name}\n[\n{data}\n]\nShape: {self.shape}, dtype: {self.dtype}"

def _formatter(self, boxed: bool = False) -> Callable[[Any], Optional[str]]:
"""
Formatting function for scalar values.
Expand Down
23 changes: 12 additions & 11 deletions pandas/core/arrays/boolean.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,10 +193,8 @@ def coerce_to_array(
if mask_values is not None:
mask = mask | mask_values

if values.ndim != 1:
raise ValueError("values must be a 1D list-like")
if mask.ndim != 1:
raise ValueError("mask must be a 1D list-like")
if values.shape != mask.shape:
raise ValueError("values.shape and mask.shape must match")

return values, mask

Expand Down Expand Up @@ -411,9 +409,9 @@ def _values_for_argsort(self) -> np.ndarray:
"""
data = self._data.copy()
data[self._mask] = -1
return data
return data.ravel("K")

def any(self, *, skipna: bool = True, **kwargs):
def any(self, *, skipna: bool = True, axis: Optional[int] = 0, **kwargs):
"""
Return whether any element is True.

Expand All @@ -430,6 +428,7 @@ def any(self, *, skipna: bool = True, **kwargs):
If `skipna` is False, the result will still be True if there is
at least one element that is True, otherwise NA will be returned
if there are NA's present.
axis : int or None, default 0
Axis along which to reduce; passed to the ``any`` call on the
underlying NumPy data.
**kwargs : any, default None
Additional keywords have no effect but might be accepted for
compatibility with NumPy.
Expand Down Expand Up @@ -472,16 +471,17 @@ def any(self, *, skipna: bool = True, **kwargs):

values = self._data.copy()
np.putmask(values, self._mask, False)
result = values.any()
result = values.any(axis=axis)

if skipna:
return result
else:
if result or len(self) == 0 or not self._mask.any():
if result or self.size == 0 or not self._mask.any():
return result
else:
return self.dtype.na_value

def all(self, *, skipna: bool = True, **kwargs):
def all(self, *, skipna: bool = True, axis: Optional[int] = 0, **kwargs):
"""
Return whether all elements are True.

Expand All @@ -498,6 +498,7 @@ def all(self, *, skipna: bool = True, **kwargs):
If `skipna` is False, the result will still be False if there is
at least one element that is False, otherwise NA will be returned
if there are NA's present.
axis : int or None, default 0
Axis along which to reduce; passed to the ``all`` call on the
underlying NumPy data.
**kwargs : any, default None
Additional keywords have no effect but might be accepted for
compatibility with NumPy.
Expand Down Expand Up @@ -538,12 +539,12 @@ def all(self, *, skipna: bool = True, **kwargs):

values = self._data.copy()
np.putmask(values, self._mask, True)
result = values.all()
result = values.all(axis=axis)

if skipna:
return result
else:
if not result or len(self) == 0 or not self._mask.any():
if not result or self.size == 0 or not self._mask.any():
return result
else:
return self.dtype.na_value
Expand Down
16 changes: 8 additions & 8 deletions pandas/core/arrays/floating.py
Original file line number Diff line number Diff line change
Expand Up @@ -394,21 +394,21 @@ def _cmp_method(self, other, op):

return BooleanArray(result, mask)

def sum(self, *, skipna=True, min_count=0, **kwargs):
def sum(self, *, skipna=True, min_count=0, axis: Optional[int] = 0, **kwargs):
nv.validate_sum((), kwargs)
return super()._reduce("sum", skipna=skipna, min_count=min_count)
return super()._reduce("sum", skipna=skipna, min_count=min_count, axis=axis)

def prod(self, *, skipna=True, min_count=0, **kwargs):
def prod(self, *, skipna=True, min_count=0, axis: Optional[int] = 0, **kwargs):
nv.validate_prod((), kwargs)
return super()._reduce("prod", skipna=skipna, min_count=min_count)
return super()._reduce("prod", skipna=skipna, min_count=min_count, axis=axis)

def min(self, *, skipna=True, **kwargs):
def min(self, *, skipna=True, axis: Optional[int] = 0, **kwargs):
nv.validate_min((), kwargs)
return super()._reduce("min", skipna=skipna)
return super()._reduce("min", skipna=skipna, axis=axis)

def max(self, *, skipna=True, **kwargs):
def max(self, *, skipna=True, axis: Optional[int] = 0, **kwargs):
nv.validate_max((), kwargs)
return super()._reduce("max", skipna=skipna)
return super()._reduce("max", skipna=skipna, axis=axis)

def _maybe_mask_result(self, result, mask, other, op_name: str):
"""
Expand Down
18 changes: 9 additions & 9 deletions pandas/core/arrays/integer.py
Original file line number Diff line number Diff line change
Expand Up @@ -423,7 +423,7 @@ def _values_for_argsort(self) -> np.ndarray:
data = self._data.copy()
if self._mask.any():
data[self._mask] = data.min() - 1
return data
return data.ravel("K")

def _cmp_method(self, other, op):
from pandas.core.arrays import BooleanArray
Expand Down Expand Up @@ -470,21 +470,21 @@ def _cmp_method(self, other, op):

return BooleanArray(result, mask)

def sum(self, *, skipna=True, min_count=0, **kwargs):
def sum(self, *, skipna=True, min_count=0, axis: Optional[int] = 0, **kwargs):
nv.validate_sum((), kwargs)
return super()._reduce("sum", skipna=skipna, min_count=min_count)
return super()._reduce("sum", skipna=skipna, min_count=min_count, axis=axis)

def prod(self, *, skipna=True, min_count=0, **kwargs):
def prod(self, *, skipna=True, min_count=0, axis: Optional[int] = 0, **kwargs):
nv.validate_prod((), kwargs)
return super()._reduce("prod", skipna=skipna, min_count=min_count)
return super()._reduce("prod", skipna=skipna, min_count=min_count, axis=axis)

def min(self, *, skipna=True, **kwargs):
def min(self, *, skipna=True, axis: Optional[int] = 0, **kwargs):
nv.validate_min((), kwargs)
return super()._reduce("min", skipna=skipna)
return super()._reduce("min", skipna=skipna, axis=axis)

def max(self, *, skipna=True, **kwargs):
def max(self, *, skipna=True, axis: Optional[int] = 0, **kwargs):
nv.validate_max((), kwargs)
return super()._reduce("max", skipna=skipna)
return super()._reduce("max", skipna=skipna, axis=axis)

def _maybe_mask_result(self, result, mask, other, op_name: str):
"""
Expand Down
Loading