Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ArrayManager] Array version of putmask logic #44396

Closed
50 changes: 24 additions & 26 deletions pandas/core/internals/array_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@
new_block,
to_native_types,
)
from pandas.core.internals.methods import putmask_flexible

if TYPE_CHECKING:
from pandas import Float64Index
Expand Down Expand Up @@ -190,7 +191,7 @@ def __repr__(self) -> str:
def apply(
self: T,
f,
align_keys: list[str] | None = None,
align_keys: list[str] | None = None, # not used for ArrayManager
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

not worth having a separate one for AM?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What do you mean exactly? Separate signature for ArrayManager?
I initially remove this from the signature (since it is not used), but we have this method as an abstract method on the base class manager, and thus it gives typing errors if I remove it here.

ignore_failures: bool = False,
**kwargs,
) -> T:
Expand All @@ -201,7 +202,6 @@ def apply(
----------
f : str or callable
Name of the Array method to apply.
align_keys: List[str] or None, default None
ignore_failures: bool, default False
**kwargs
Keywords to pass to `f`
Expand All @@ -212,32 +212,14 @@ def apply(
"""
assert "filter" not in kwargs

align_keys = align_keys or []
result_arrays: list[np.ndarray] = []
result_indices: list[int] = []
# fillna: Series/DataFrame is responsible for making sure value is aligned

aligned_args = {k: kwargs[k] for k in align_keys}

if f == "apply":
f = kwargs.pop("func")

for i, arr in enumerate(self.arrays):

if aligned_args:

for k, obj in aligned_args.items():
if isinstance(obj, (ABCSeries, ABCDataFrame)):
# The caller is responsible for ensuring that
# obj.axes[-1].equals(self.items)
if obj.ndim == 1:
kwargs[k] = obj.iloc[i]
else:
kwargs[k] = obj.iloc[:, i]._values
else:
# otherwise we have an array-like
kwargs[k] = obj[i]

try:
if callable(f):
applied = f(arr, **kwargs)
Expand Down Expand Up @@ -352,12 +334,28 @@ def putmask(self, mask, new, align: bool = True):
align_keys = ["mask"]
new = extract_array(new, extract_numpy=True)

return self.apply_with_block(
"putmask",
align_keys=align_keys,
mask=mask,
new=new,
)
kwargs = {"mask": mask, "new": new}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

not feasible to go through 'apply'?

Copy link
Member Author

@jorisvandenbossche jorisvandenbossche Nov 12, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point. So I forgot that ArrayManager.apply indeed already has this "align" logic as well. However, that is not used at the moment (only putmask and where define align keys, and those both still go through apply_with_block at the moment; this is also confirmed by codecov that it is unused).

But then I would rather remove it from apply, instead of going through apply, which will also prevent duplication. The reason for doing it in putmask: 1) apply is already quite complex (eg also dealing with ignoring failures), so being able to remove the "alignment" handling there would be nice (can still share this later with where), and 2) putmask can update the existing arrays, while apply always constructs a new manager, and 3) for the CoW branch I need to add copy-on-write logic to putmask, which is not needed for apply in general.

aligned_kwargs = {k: kwargs[k] for k in align_keys}

for i, arr in enumerate(self.arrays):

for k, obj in aligned_kwargs.items():
if isinstance(obj, (ABCSeries, ABCDataFrame)):
# The caller is responsible for ensuring that
# obj.axes[-1].equals(self.items)
if obj.ndim == 1:
kwargs[k] = obj._values
else:
kwargs[k] = obj.iloc[:, i]._values
else:
# otherwise we have an ndarray
if self.ndim == 2:
kwargs[k] = obj[i]

new = putmask_flexible(arr, **kwargs)
self.arrays[i] = new

return self

def diff(self: T, n: int, axis: int) -> T:
if axis == 1:
Expand Down
26 changes: 2 additions & 24 deletions pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@
is_empty_indexer,
is_scalar_indexer,
)
from pandas.core.internals.methods import putmask_flexible_ea
import pandas.core.missing as missing

if TYPE_CHECKING:
Expand Down Expand Up @@ -1417,30 +1418,7 @@ def putmask(self, mask, new) -> list[Block]:
"""
See Block.putmask.__doc__
"""
mask = extract_bool_array(mask)

new_values = self.values

if mask.ndim == new_values.ndim + 1:
# TODO(EA2D): unnecessary with 2D EAs
mask = mask.reshape(new_values.shape)

try:
# Caller is responsible for ensuring matching lengths
new_values._putmask(mask, new)
except TypeError:
if not is_interval_dtype(self.dtype):
# Discussion about what we want to support in the general
# case GH#39584
raise

blk = self.coerce_to_target_dtype(new)
if blk.dtype == _dtype_obj:
# For now at least, only support casting e.g.
# Interval[int64]->Interval[float64],
raise
return blk.putmask(mask, new)

new_values = putmask_flexible_ea(self.values, mask, new)
nb = type(self)(new_values, placement=self._mgr_locs, ndim=self.ndim)
return [nb]

Expand Down
118 changes: 118 additions & 0 deletions pandas/core/internals/methods.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
"""
Wrappers around array_algos with internals-specific logic
"""
from __future__ import annotations

import numpy as np

from pandas.core.dtypes.cast import (
can_hold_element,
find_common_type,
infer_dtype_from,
)
from pandas.core.dtypes.common import is_interval_dtype
from pandas.core.dtypes.generic import (
ABCDataFrame,
ABCIndex,
ABCSeries,
)
from pandas.core.dtypes.missing import (
is_valid_na_for_dtype,
na_value_for_dtype,
)

from pandas.core.array_algos.putmask import (
extract_bool_array,
putmask_smart,
putmask_without_repeat,
setitem_datetimelike_compat,
validate_putmask,
)
from pandas.core.arrays import ExtensionArray
from pandas.core.arrays._mixins import NDArrayBackedExtensionArray


def putmask_flexible(array: np.ndarray | ExtensionArray, mask, new):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

where is all of this logic from? this doesn't look like 'simple' wrappers to me.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's logic that for the BlockManager lives on the blocks (the Block.putmask version also calls those putmask_without_repeat and putmask_smart helpers from core.array_algos.putmask

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

great can u then remove from where it's used now

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The BlockManager is still used ..
I know this gives some duplication, but the parts that can be shared are already in pandas.core.array_algos.putmask. Both the Block version as this version are using those functions.

I can try to see if there are some more parts that could be moved into array_algos.putmask

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, so I could easily remove most of the logic of ExtensionBlock.putmask and reuse the new helper function. For base Block.putmask that will be more difficult, as that directly involves splitting blocks if needed etc (so Block-specific logic)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

let me put this another way. you are adding a ton of code to methods, is this generally useful? can we reuse elsewhere?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you are adding a ton of code to methods, is this generally useful? can we reuse elsewhere?

No, it is only for use within array_manager.py, but I put it in a separate internals/methods.py file to not make array_manager.py any longer (and on request of Brock I also didn't put it in /core/array_algos/putmask.py because this is the custom putmask logic for internals (with fallback logic etc that we need for Series/DataFrame), and not a general putmask algo)

"""
Putmask implementation for ArrayManager.putmask.

Flexible version that will upcast if needed.
"""
if isinstance(array, np.ndarray):
return putmask_flexible_ndarray(array, mask=mask, new=new)
else:
return putmask_flexible_ea(array, mask=mask, new=new)


def putmask_flexible_ndarray(array: np.ndarray, mask, new):
"""
Putmask implementation for ArrayManager putmask for ndarray.

Flexible version that will upcast if needed.
"""
mask, noop = validate_putmask(array, mask)
assert not isinstance(new, (ABCIndex, ABCSeries, ABCDataFrame))

# if we are passed a scalar None, convert it here
if not array.dtype == "object" and is_valid_na_for_dtype(new, array.dtype):
new = na_value_for_dtype(array.dtype, compat=False)

if can_hold_element(array, new):
putmask_without_repeat(array, mask, new)
return array

elif noop:
return array

dtype, _ = infer_dtype_from(new)
if dtype.kind in ["m", "M"]:
array = array.astype(object)
# convert to list to avoid numpy coercing datetimelikes to integers
new = setitem_datetimelike_compat(array, mask.sum(), new)
# putmask_smart below converts it back to array
np.putmask(array, mask, new)
return array

new_values = putmask_smart(array, mask, new)
return new_values


def _coerce_to_target_dtype(array, new):
dtype, _ = infer_dtype_from(new, pandas_dtype=True)
new_dtype = find_common_type([array.dtype, dtype])
return array.astype(new_dtype, copy=False)


def putmask_flexible_ea(array: ExtensionArray, mask, new):
"""
Putmask implementation for ArrayManager putmask for EA.

Flexible version that will upcast if needed.
"""
mask = extract_bool_array(mask)

if mask.ndim == array.ndim + 1:
# TODO(EA2D): unnecessary with 2D EAs
mask = mask.reshape(array.shape)

if isinstance(array, NDArrayBackedExtensionArray):
if not can_hold_element(array, new):
array = _coerce_to_target_dtype(array, new)
return putmask_flexible(array, mask, new)

try:
array._putmask(mask, new)
except TypeError:
if not is_interval_dtype(array.dtype):
# Discussion about what we want to support in the general
# case GH#39584
raise

array = _coerce_to_target_dtype(array, new)
if array.dtype == np.dtype("object"):
# For now at least, only support casting e.g.
# Interval[int64]->Interval[float64],
raise
return putmask_flexible_ea(array, mask, new)

return array
1 change: 1 addition & 0 deletions pandas/tests/internals/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ def test_namespace():
"blocks",
"concat",
"managers",
"methods",
"construction",
"array_manager",
"base",
Expand Down