Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

PERF: implement scalar ops blockwise #29853

Merged
merged 20 commits into from
Dec 27, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
15f0caa
REF: implement scalar ops blockwise
jbrockmendel Nov 26, 2019
08a43f0
Merge branch 'master' of https://github.com/pandas-dev/pandas into ba…
jbrockmendel Nov 27, 2019
a765069
Merge branch 'master' of https://github.com/pandas-dev/pandas into ba…
jbrockmendel Nov 29, 2019
c81ea13
Merge branch 'master' of https://github.com/pandas-dev/pandas into ba…
jbrockmendel Dec 8, 2019
c2f6129
Merge branch 'master' of https://github.com/pandas-dev/pandas into ba…
jbrockmendel Dec 9, 2019
016ae64
fix missing name
jbrockmendel Dec 9, 2019
4536097
revert
jbrockmendel Dec 9, 2019
798ce75
Merge branch 'master' of https://github.com/pandas-dev/pandas into ba…
jbrockmendel Dec 20, 2019
1fc1e3e
Fix numexpr tests
jbrockmendel Dec 21, 2019
657d1bb
ADD asv
jbrockmendel Dec 21, 2019
66d34c2
remove commented-out
jbrockmendel Dec 21, 2019
0f26775
Whatsnew
jbrockmendel Dec 21, 2019
a0e4adc
blackify
jbrockmendel Dec 21, 2019
23d5c48
isort fixup
jbrockmendel Dec 21, 2019
2228f5e
remove asv params that fail in CI
jbrockmendel Dec 21, 2019
e230cea
Merge branch 'master' of https://github.com/pandas-dev/pandas into ba…
jbrockmendel Dec 24, 2019
2f80502
comment+docstring
jbrockmendel Dec 24, 2019
31607c0
Merge branch 'master' of https://github.com/pandas-dev/pandas into ba…
jbrockmendel Dec 26, 2019
0ec7e74
Merge branch 'master' of https://github.com/pandas-dev/pandas into ba…
jbrockmendel Dec 27, 2019
cf94d13
remove unreachable
jbrockmendel Dec 27, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 32 additions & 0 deletions asv_bench/benchmarks/binary_ops.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import operator

import numpy as np

from pandas import DataFrame, Series, date_range
Expand All @@ -9,6 +11,36 @@
import pandas.computation.expressions as expr


class IntFrameWithScalar:
    """
    Time binary operators applied to a DataFrame and a scalar.

    The benchmark is parametrized over the dtype the frame's data is cast
    to, the scalar operand, and the operator being applied.
    """

    params = [
        # dtype: what the random data is cast to before building the frame
        [np.float64, np.int64],
        # scalar: the right-hand operand
        [2, 3.0, np.int32(4), np.float64(5)],
        # op: arithmetic operators first, then comparison operators
        [
            operator.add,
            operator.sub,
            operator.mul,
            operator.truediv,
            operator.floordiv,
            operator.pow,
            operator.mod,
            operator.eq,
            operator.ne,
            operator.gt,
            operator.ge,
            operator.lt,
            operator.le,
        ],
    ]
    param_names = ["dtype", "scalar", "op"]

    def setup(self, dtype, scalar, op):
        """Build the 20000 x 100 frame; ``scalar`` and ``op`` are unused here."""
        values = np.random.randn(20000, 100).astype(dtype)
        self.df = DataFrame(values)

    def time_frame_op_with_scalar(self, dtype, scalar, op):
        """Apply ``op`` to the prepared frame and ``scalar``; result is discarded."""
        op(self.df, scalar)


class Ops:

params = [[True, False], ["default", 1]]
Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -656,6 +656,7 @@ or ``matplotlib.Axes.plot``. See :ref:`plotting.formatters` for more.
Performance improvements
~~~~~~~~~~~~~~~~~~~~~~~~

- Performance improvement in :class:`DataFrame` arithmetic and comparison operations with scalars (:issue:`24990`, :issue:`29853`)
- Performance improvement in indexing with a non-unique :class:`IntervalIndex` (:issue:`27489`)
- Performance improvement in :attr:`MultiIndex.is_monotonic` (:issue:`27495`)
- Performance improvement in :func:`cut` when ``bins`` is an :class:`IntervalIndex` (:issue:`27668`)
Expand Down
31 changes: 27 additions & 4 deletions pandas/core/arrays/datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -325,6 +325,24 @@ class DatetimeLikeArrayMixin(ExtensionOpsMixin, AttributesMixin, ExtensionArray)
_generate_range
"""

@property
def ndim(self) -> int:
    """Number of dimensions, as reported by the underlying ``_data``."""
    backing = self._data
    return backing.ndim

@property
def shape(self):
    """Shape, as reported by the underlying ``_data``."""
    backing = self._data
    return backing.shape

def reshape(self, *args, **kwargs):
    """
    Return a new instance of the same type wrapping the reshaped ``_data``.

    All positional and keyword arguments are forwarded unchanged to
    ``self._data.reshape``.  The result is rebuilt from the reshaped data
    and ``self.dtype`` only, so any freq is dropped.
    """
    reshaped = self._data.reshape(*args, **kwargs)
    cls = type(self)
    return cls(reshaped, dtype=self.dtype)

def ravel(self, *args, **kwargs):
    """
    Return a new instance of the same type wrapping the raveled ``_data``.

    All positional and keyword arguments are forwarded unchanged to
    ``self._data.ravel``.  The result is rebuilt from the raveled data
    and ``self.dtype`` only, so any freq is dropped.
    """
    flattened = self._data.ravel(*args, **kwargs)
    cls = type(self)
    return cls(flattened, dtype=self.dtype)

@property
def _box_func(self):
"""
Expand Down Expand Up @@ -413,7 +431,10 @@ def __getitem__(self, key):
getitem = self._data.__getitem__
if is_int:
val = getitem(key)
return self._box_func(val)
if lib.is_scalar(val):
# i.e. self.ndim == 1
return self._box_func(val)
return type(self)(val, dtype=self.dtype)

if com.is_bool_indexer(key):
key = np.asarray(key, dtype=bool)
Expand Down Expand Up @@ -823,6 +844,8 @@ def inferred_freq(self):
generated by infer_freq. Returns None if it can't autodetect the
frequency.
"""
if self.ndim != 1:
return None
try:
return frequencies.infer_freq(self)
except ValueError:
Expand Down Expand Up @@ -968,7 +991,7 @@ def _add_timedeltalike_scalar(self, other):
"""
if isna(other):
# i.e np.timedelta64("NaT"), not recognized by delta_to_nanoseconds
new_values = np.empty(len(self), dtype="i8")
new_values = np.empty(self.shape, dtype="i8")
new_values[:] = iNaT
return new_values

Expand Down Expand Up @@ -1014,7 +1037,7 @@ def _add_nat(self):

# GH#19124 pd.NaT is treated like a timedelta for both timedelta
# and datetime dtypes
result = np.zeros(len(self), dtype=np.int64)
result = np.zeros(self.shape, dtype=np.int64)
result.fill(iNaT)
return type(self)(result, dtype=self.dtype, freq=None)

Expand All @@ -1028,7 +1051,7 @@ def _sub_nat(self):
# For datetime64 dtypes by convention we treat NaT as a datetime, so
# this subtraction returns a timedelta64 dtype.
# For period dtype, timedelta64 is a close-enough return dtype.
result = np.zeros(len(self), dtype=np.int64)
result = np.zeros(self.shape, dtype=np.int64)
result.fill(iNaT)
return result.view("timedelta64[ns]")

Expand Down
5 changes: 4 additions & 1 deletion pandas/core/arrays/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -339,7 +339,7 @@ def __init__(self, values, dtype=_NS_DTYPE, freq=None, copy=False):
" those."
)
raise ValueError(msg)
if values.ndim != 1:
if values.ndim not in [1, 2]:
raise ValueError("Only 1-dimensional input arrays are supported.")

if values.dtype == "i8":
Expand Down Expand Up @@ -788,6 +788,9 @@ def _sub_datetime_arraylike(self, other):
return new_values.view("timedelta64[ns]")

def _add_offset(self, offset):
if self.ndim == 2:
return self.ravel()._add_offset(offset).reshape(self.shape)

assert not isinstance(offset, Tick)
try:
if self.tz is not None:
Expand Down
4 changes: 1 addition & 3 deletions pandas/core/arrays/timedeltas.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,7 +217,7 @@ def __init__(self, values, dtype=_TD_DTYPE, freq=None, copy=False):
" TimedeltaArray ndarray, or Series or Index containing one of those."
)
raise ValueError(msg)
if values.ndim != 1:
if values.ndim not in [1, 2]:
raise ValueError("Only 1-dimensional input arrays are supported.")

if values.dtype == "i8":
Expand Down Expand Up @@ -1036,8 +1036,6 @@ def sequence_to_td64ns(data, copy=False, unit="ns", errors="raise"):
raise TypeError(f"dtype {data.dtype} cannot be converted to timedelta64[ns]")

data = np.array(data, copy=copy)
if data.ndim != 1:
raise ValueError("Only 1-dimensional input arrays are supported.")

assert data.dtype == "m8[ns]", data
return data, inferred_freq
Expand Down
12 changes: 12 additions & 0 deletions pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -368,7 +368,19 @@ def apply(self, func, **kwargs):
"""
with np.errstate(all="ignore"):
result = func(self.values, **kwargs)

if is_extension_array_dtype(result) and result.ndim > 1:
# if we get a 2D ExtensionArray, we need to split it into 1D pieces
nbs = []
for i, loc in enumerate(self.mgr_locs):
vals = result[i]
nv = _block_shape(vals, ndim=self.ndim)
block = self.make_block(values=nv, placement=[loc])
nbs.append(block)
return nbs

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This could be an elif here that re-assigns to result, just to make the flow more natural. Alternatively, this could be made into a method on BM, but that's for follow-ons.

if not isinstance(result, Block):
# Exclude the 0-dim case so we can do reductions
result = self.make_block(values=_block_shape(result, ndim=self.ndim))

return result
Expand Down
9 changes: 6 additions & 3 deletions pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -340,13 +340,13 @@ def _verify_integrity(self):
f"tot_items: {tot_items}"
)

def apply(self, f: str, filter=None, **kwargs):
def apply(self, f, filter=None, **kwargs):
"""
Iterate over the blocks, collect and create a new BlockManager.

Parameters
----------
f : str
f : str or callable
Name of the Block method to apply.
filter : list, if supplied, only call the block if the filter is in
the block
Expand Down Expand Up @@ -411,7 +411,10 @@ def apply(self, f: str, filter=None, **kwargs):
axis = obj._info_axis_number
kwargs[k] = obj.reindex(b_items, axis=axis, copy=align_copy)

applied = getattr(b, f)(**kwargs)
if callable(f):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this strictly necessary? Meaning, I'm happy to require only callables here (though that would require some changes).

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

All of our existing usages pass strings here to get at Block methods. I think @WillAyd had a suggestion about re-working Block.apply to do the str vs. callable handling there; that should be its own PR.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, yeah, this whole section could use some TLC.

applied = b.apply(f, **kwargs)
else:
applied = getattr(b, f)(**kwargs)
result_blocks = _extend_blocks(applied, result_blocks)

if len(result_blocks) == 0:
Expand Down
9 changes: 6 additions & 3 deletions pandas/core/ops/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
arithmetic_op,
comparison_op,
define_na_arithmetic_op,
get_array_op,
logical_op,
)
from pandas.core.ops.array_ops import comp_method_OBJECT_ARRAY # noqa:F401
Expand Down Expand Up @@ -372,8 +373,10 @@ def dispatch_to_series(left, right, func, str_rep=None, axis=None):
right = lib.item_from_zerodim(right)
if lib.is_scalar(right) or np.ndim(right) == 0:

def column_op(a, b):
return {i: func(a.iloc[:, i], b) for i in range(len(a.columns))}
# Get the appropriate array-op to apply to each block's values.
array_op = get_array_op(func, str_rep=str_rep)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add a comment here on what is going on?

bm = left._data.apply(array_op, right=right)
return type(left)(bm)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this could just be an if (as you are returning), e.g. change the following elif to an if, but NBD

elif isinstance(right, ABCDataFrame):
assert right._indexed_same(left)
Expand Down Expand Up @@ -713,7 +716,7 @@ def f(self, other, axis=default_axis, level=None, fill_value=None):
if fill_value is not None:
self = self.fillna(fill_value)

new_data = dispatch_to_series(self, other, op)
new_data = dispatch_to_series(self, other, op, str_rep)
return self._construct_result(new_data)

f.__name__ = op_name
Expand Down
37 changes: 31 additions & 6 deletions pandas/core/ops/array_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,9 @@
Functions for arithmetic and comparison operations on NumPy arrays and
ExtensionArrays.
"""
from functools import partial
import operator
from typing import Any, Union
from typing import Any, Optional, Union

import numpy as np

Expand Down Expand Up @@ -51,10 +52,10 @@ def comp_method_OBJECT_ARRAY(op, x, y):
if isinstance(y, (ABCSeries, ABCIndex)):
y = y.values

result = libops.vec_compare(x, y, op)
result = libops.vec_compare(x.ravel(), y, op)
else:
result = libops.scalar_compare(x, y, op)
return result
result = libops.scalar_compare(x.ravel(), y, op)
return result.reshape(x.shape)


def masked_arith_op(x, y, op):
Expand Down Expand Up @@ -237,9 +238,9 @@ def comparison_op(
elif is_scalar(rvalues) and isna(rvalues):
# numpy does not like comparisons vs None
if op is operator.ne:
res_values = np.ones(len(lvalues), dtype=bool)
res_values = np.ones(lvalues.shape, dtype=bool)
else:
res_values = np.zeros(len(lvalues), dtype=bool)
res_values = np.zeros(lvalues.shape, dtype=bool)

elif is_object_dtype(lvalues.dtype):
res_values = comp_method_OBJECT_ARRAY(op, lvalues, rvalues)
Expand Down Expand Up @@ -367,3 +368,27 @@ def fill_bool(x, left=None):
res_values = filler(res_values) # type: ignore

return res_values


def get_array_op(op, str_rep: Optional[str] = None):
    """
    Build the array-level binary operation that corresponds to ``op``.

    The choice is made from ``op.__name__`` with leading and trailing
    underscores stripped: comparison names map to ``comparison_op``,
    logical names (including their reversed forms) map to ``logical_op``,
    and every other name maps to ``arithmetic_op``.

    Parameters
    ----------
    op : function
        Binary operator from operator or roperator module.
    str_rep : str or None, default None
        str_rep to pass to arithmetic_op; not used for comparison or
        logical operators.

    Returns
    -------
    function
        A ``functools.partial`` with ``op`` (and, for arithmetic,
        ``str_rep``) already bound.
    """
    name = op.__name__.strip("_")

    if name in {"eq", "ne", "lt", "le", "gt", "ge"}:
        return partial(comparison_op, op=op)

    if name in {"and", "or", "xor", "rand", "ror", "rxor"}:
        return partial(logical_op, op=op)

    # Anything that is neither a comparison nor a logical op is arithmetic.
    return partial(arithmetic_op, op=op, str_rep=str_rep)
4 changes: 2 additions & 2 deletions pandas/tests/arrays/test_datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,8 @@ def test_only_1dim_accepted(self):
arr = np.array([0, 1, 2, 3], dtype="M8[h]").astype("M8[ns]")

with pytest.raises(ValueError, match="Only 1-dimensional"):
# 2-dim
DatetimeArray(arr.reshape(2, 2))
# 3-dim, we allow 2D to sneak in for ops purposes GH#29853
DatetimeArray(arr.reshape(2, 2, 1))

with pytest.raises(ValueError, match="Only 1-dimensional"):
# 0-dim
Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/arrays/test_timedeltas.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@ def test_only_1dim_accepted(self):
arr = np.array([0, 1, 2, 3], dtype="m8[h]").astype("m8[ns]")

with pytest.raises(ValueError, match="Only 1-dimensional"):
# 2-dim
TimedeltaArray(arr.reshape(2, 2))
# 3-dim, we allow 2D to sneak in for ops purposes GH#29853
TimedeltaArray(arr.reshape(2, 2, 1))

with pytest.raises(ValueError, match="Only 1-dimensional"):
# 0-dim
Expand Down