Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

PERF: implement scalar ops blockwise #29853

Merged
merged 20 commits into from
Dec 27, 2019
Merged
Show file tree
Hide file tree
Changes from 15 commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
15f0caa
REF: implement scalar ops blockwise
jbrockmendel Nov 26, 2019
08a43f0
Merge branch 'master' of https://github.com/pandas-dev/pandas into ba…
jbrockmendel Nov 27, 2019
a765069
Merge branch 'master' of https://github.com/pandas-dev/pandas into ba…
jbrockmendel Nov 29, 2019
c81ea13
Merge branch 'master' of https://github.com/pandas-dev/pandas into ba…
jbrockmendel Dec 8, 2019
c2f6129
Merge branch 'master' of https://github.com/pandas-dev/pandas into ba…
jbrockmendel Dec 9, 2019
016ae64
fix missing name
jbrockmendel Dec 9, 2019
4536097
revert
jbrockmendel Dec 9, 2019
798ce75
Merge branch 'master' of https://github.com/pandas-dev/pandas into ba…
jbrockmendel Dec 20, 2019
1fc1e3e
Fix numexpr tests
jbrockmendel Dec 21, 2019
657d1bb
ADD asv
jbrockmendel Dec 21, 2019
66d34c2
remove commented-out
jbrockmendel Dec 21, 2019
0f26775
Whatsnew
jbrockmendel Dec 21, 2019
a0e4adc
blackify
jbrockmendel Dec 21, 2019
23d5c48
isort fixup
jbrockmendel Dec 21, 2019
2228f5e
remove asv params that fail in ci
jbrockmendel Dec 21, 2019
e230cea
Merge branch 'master' of https://github.com/pandas-dev/pandas into ba…
jbrockmendel Dec 24, 2019
2f80502
comment+docstring
jbrockmendel Dec 24, 2019
31607c0
Merge branch 'master' of https://github.com/pandas-dev/pandas into ba…
jbrockmendel Dec 26, 2019
0ec7e74
Merge branch 'master' of https://github.com/pandas-dev/pandas into ba…
jbrockmendel Dec 27, 2019
cf94d13
remove unreachable
jbrockmendel Dec 27, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 32 additions & 0 deletions asv_bench/benchmarks/binary_ops.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import operator

import numpy as np

from pandas import DataFrame, Series, date_range
Expand All @@ -9,6 +11,36 @@
import pandas.computation.expressions as expr


class IntFrameWithScalar:
    # Benchmarks for DataFrame <op> scalar, across dtypes, scalar types,
    # and arithmetic/comparison operators.
    params = [
        [np.float64, np.int64],
        [2, 3.0, np.int32(4), np.float64(5)],
        [
            getattr(operator, opname)
            for opname in (
                "add",
                "sub",
                "mul",
                "truediv",
                "floordiv",
                "pow",
                "mod",
                "eq",
                "ne",
                "gt",
                "ge",
                "lt",
                "le",
            )
        ],
    ]
    param_names = ["dtype", "scalar", "op"]

    def setup(self, dtype, scalar, op):
        # A single large homogeneous-dtype frame (20000 x 100).
        values = np.random.randn(20000, 100).astype(dtype)
        self.df = DataFrame(values)

    def time_frame_op_with_scalar(self, dtype, scalar, op):
        op(self.df, scalar)


class Ops:

params = [[True, False], ["default", 1]]
Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -655,6 +655,7 @@ or ``matplotlib.Axes.plot``. See :ref:`plotting.formatters` for more.
Performance improvements
~~~~~~~~~~~~~~~~~~~~~~~~

- Performance improvement in :class:`DataFrame` arithmetic and comparison operations with scalars (:issue:`24990`, :issue:`29853`)
- Performance improvement in indexing with a non-unique :class:`IntervalIndex` (:issue:`27489`)
- Performance improvement in :attr:`MultiIndex.is_monotonic` (:issue:`27495`)
- Performance improvement in :func:`cut` when ``bins`` is an :class:`IntervalIndex` (:issue:`27668`)
Expand Down
31 changes: 27 additions & 4 deletions pandas/core/arrays/datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -325,6 +325,24 @@ class DatetimeLikeArrayMixin(ExtensionOpsMixin, AttributesMixin, ExtensionArray)
_generate_range
"""

@property
def ndim(self) -> int:
    """Number of dimensions of the backing array."""
    return self._data.ndim

@property
def shape(self):
    """Shape tuple of the backing array."""
    return self._data.shape

def reshape(self, *args, **kwargs):
    """
    Return a reshaped version of this array.

    Any ``freq`` is dropped: the result is constructed from dtype alone.
    """
    # Note: we drop any freq
    data = self._data.reshape(*args, **kwargs)
    return type(self)(data, dtype=self.dtype)

def ravel(self, *args, **kwargs):
    """
    Return a flattened version of this array.

    Any ``freq`` is dropped: the result is constructed from dtype alone.
    """
    # Note: we drop any freq
    data = self._data.ravel(*args, **kwargs)
    return type(self)(data, dtype=self.dtype)

@property
def _box_func(self):
"""
Expand Down Expand Up @@ -413,7 +431,10 @@ def __getitem__(self, key):
getitem = self._data.__getitem__
if is_int:
val = getitem(key)
return self._box_func(val)
if lib.is_scalar(val):
# i.e. self.ndim == 1
return self._box_func(val)
return type(self)(val, dtype=self.dtype)

if com.is_bool_indexer(key):
key = np.asarray(key, dtype=bool)
Expand Down Expand Up @@ -823,6 +844,8 @@ def inferred_freq(self):
generated by infer_freq. Returns None if it can't autodetect the
frequency.
"""
if self.ndim != 1:
return None
try:
return frequencies.infer_freq(self)
except ValueError:
Expand Down Expand Up @@ -970,7 +993,7 @@ def _add_timedeltalike_scalar(self, other):
"""
if isna(other):
# i.e np.timedelta64("NaT"), not recognized by delta_to_nanoseconds
new_values = np.empty(len(self), dtype="i8")
new_values = np.empty(self.shape, dtype="i8")
new_values[:] = iNaT
return new_values

Expand Down Expand Up @@ -1016,7 +1039,7 @@ def _add_nat(self):

# GH#19124 pd.NaT is treated like a timedelta for both timedelta
# and datetime dtypes
result = np.zeros(len(self), dtype=np.int64)
result = np.zeros(self.shape, dtype=np.int64)
result.fill(iNaT)
return type(self)(result, dtype=self.dtype, freq=None)

Expand All @@ -1030,7 +1053,7 @@ def _sub_nat(self):
# For datetime64 dtypes by convention we treat NaT as a datetime, so
# this subtraction returns a timedelta64 dtype.
# For period dtype, timedelta64 is a close-enough return dtype.
result = np.zeros(len(self), dtype=np.int64)
result = np.zeros(self.shape, dtype=np.int64)
result.fill(iNaT)
return result.view("timedelta64[ns]")

Expand Down
5 changes: 4 additions & 1 deletion pandas/core/arrays/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -339,7 +339,7 @@ def __init__(self, values, dtype=_NS_DTYPE, freq=None, copy=False):
" those."
)
raise ValueError(msg)
if values.ndim != 1:
if values.ndim not in [1, 2]:
raise ValueError("Only 1-dimensional input arrays are supported.")

if values.dtype == "i8":
Expand Down Expand Up @@ -788,6 +788,9 @@ def _sub_datetime_arraylike(self, other):
return new_values.view("timedelta64[ns]")

def _add_offset(self, offset):
if self.ndim == 2:
return self.ravel()._add_offset(offset).reshape(self.shape)

assert not isinstance(offset, Tick)
try:
if self.tz is not None:
Expand Down
4 changes: 1 addition & 3 deletions pandas/core/arrays/timedeltas.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,7 +217,7 @@ def __init__(self, values, dtype=_TD_DTYPE, freq=None, copy=False):
" TimedeltaArray ndarray, or Series or Index containing one of those."
)
raise ValueError(msg)
if values.ndim != 1:
if values.ndim not in [1, 2]:
raise ValueError("Only 1-dimensional input arrays are supported.")

if values.dtype == "i8":
Expand Down Expand Up @@ -1036,8 +1036,6 @@ def sequence_to_td64ns(data, copy=False, unit="ns", errors="raise"):
raise TypeError(f"dtype {data.dtype} cannot be converted to timedelta64[ns]")

data = np.array(data, copy=copy)
if data.ndim != 1:
raise ValueError("Only 1-dimensional input arrays are supported.")

assert data.dtype == "m8[ns]", data
return data, inferred_freq
Expand Down
12 changes: 12 additions & 0 deletions pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -368,7 +368,19 @@ def apply(self, func, **kwargs):
"""
with np.errstate(all="ignore"):
result = func(self.values, **kwargs)

if is_extension_array_dtype(result) and result.ndim > 1:
# if we get a 2D ExtensionArray, we need to split it into 1D pieces
nbs = []
for i, loc in enumerate(self.mgr_locs):
vals = result[i]
nv = _block_shape(vals, ndim=self.ndim)
block = self.make_block(values=nv, placement=[loc])
nbs.append(block)
return nbs

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

could be an elif here and re-assign to result, just to make the flow more natural. alt could make this into a method on BM. but for followon's

if not isinstance(result, Block):
# Exclude the 0-dim case so we can do reductions
result = self.make_block(values=_block_shape(result, ndim=self.ndim))

return result
Expand Down
11 changes: 7 additions & 4 deletions pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -340,13 +340,13 @@ def _verify_integrity(self):
f"tot_items: {tot_items}"
)

def apply(self, f: str, filter=None, **kwargs):
def apply(self, f, filter=None, **kwargs):
"""
Iterate over the blocks, collect and create a new BlockManager.

Parameters
----------
f : str
f : str or callable
Name of the Block method to apply.
filter : list, if supplied, only call the block if the filter is in
the block
Expand Down Expand Up @@ -411,7 +411,10 @@ def apply(self, f: str, filter=None, **kwargs):
axis = obj._info_axis_number
kwargs[k] = obj.reindex(b_items, axis=axis, copy=align_copy)

applied = getattr(b, f)(**kwargs)
if callable(f):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is this strictly necessary? meaning happy to require only callables here (would require some changing)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

all of our existing usages pass strings here to get at Block methods. i think @WillAyd had a suggestion about re-working Block.apply to do str vs callable handling there; that should be its own PR

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

k, yeah this whole section could use some TLC

applied = b.apply(f, **kwargs)
else:
applied = getattr(b, f)(**kwargs)
result_blocks = _extend_blocks(applied, result_blocks)

if len(result_blocks) == 0:
Expand Down Expand Up @@ -741,7 +744,7 @@ def copy(self, deep=True):

Parameters
----------
deep : boolean o rstring, default True
deep : boolean or string, default True
If False, return shallow copy (do not copy data)
If 'all', copy data and a deep copy of the index

Expand Down
7 changes: 6 additions & 1 deletion pandas/core/ops/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
arithmetic_op,
comparison_op,
define_na_arithmetic_op,
get_array_op,
logical_op,
)
from pandas.core.ops.array_ops import comp_method_OBJECT_ARRAY # noqa:F401
Expand Down Expand Up @@ -372,6 +373,10 @@ def dispatch_to_series(left, right, func, str_rep=None, axis=None):
right = lib.item_from_zerodim(right)
if lib.is_scalar(right) or np.ndim(right) == 0:

array_op = get_array_op(func, str_rep=str_rep)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you add a comment here on what is going on

bm = left._data.apply(array_op, right=right)
return type(left)(bm)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this could just be an if (as you are returning), e.g. change the following elif to an if, but NBD

def column_op(a, b):
return {i: func(a.iloc[:, i], b) for i in range(len(a.columns))}

Expand Down Expand Up @@ -713,7 +718,7 @@ def f(self, other, axis=default_axis, level=None, fill_value=None):
if fill_value is not None:
self = self.fillna(fill_value)

new_data = dispatch_to_series(self, other, op)
new_data = dispatch_to_series(self, other, op, str_rep)
return self._construct_result(new_data)

f.__name__ = op_name
Expand Down
21 changes: 16 additions & 5 deletions pandas/core/ops/array_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
Functions for arithmetic and comparison operations on NumPy arrays and
ExtensionArrays.
"""
from functools import partial
import operator
from typing import Any, Union

Expand Down Expand Up @@ -51,10 +52,10 @@ def comp_method_OBJECT_ARRAY(op, x, y):
if isinstance(y, (ABCSeries, ABCIndex)):
y = y.values

result = libops.vec_compare(x, y, op)
result = libops.vec_compare(x.ravel(), y, op)
else:
result = libops.scalar_compare(x, y, op)
return result
result = libops.scalar_compare(x.ravel(), y, op)
return result.reshape(x.shape)


def masked_arith_op(x, y, op):
Expand Down Expand Up @@ -237,9 +238,9 @@ def comparison_op(
elif is_scalar(rvalues) and isna(rvalues):
# numpy does not like comparisons vs None
if op is operator.ne:
res_values = np.ones(len(lvalues), dtype=bool)
res_values = np.ones(lvalues.shape, dtype=bool)
else:
res_values = np.zeros(len(lvalues), dtype=bool)
res_values = np.zeros(lvalues.shape, dtype=bool)

elif is_object_dtype(lvalues.dtype):
res_values = comp_method_OBJECT_ARRAY(op, lvalues, rvalues)
Expand Down Expand Up @@ -367,3 +368,13 @@ def fill_bool(x, left=None):
res_values = filler(res_values) # type: ignore

return res_values


def get_array_op(op, str_rep=None):
    """
    Return the array-level operation matching the given operator.

    Parameters
    ----------
    op : callable
        Operator, e.g. from the ``operator`` module; dunder-style names
        have leading/trailing underscores stripped before dispatch.
    str_rep : str or None, default None
        String form of the op, forwarded to the arithmetic path
        (only arithmetic ops make use of it).

    Returns
    -------
    functools.partial
        ``comparison_op``, ``logical_op``, or ``arithmetic_op`` with
        ``op`` bound in.
    """
    op_name = op.__name__.strip("_")
    if op_name in {"eq", "ne", "lt", "le", "gt", "ge"}:
        return partial(comparison_op, op=op)
    elif op_name in {"and", "or", "xor", "rand", "ror", "rxor"}:
        return partial(logical_op, op=op)
    else:
        # Default: arithmetic; str_rep only matters on this path.
        return partial(arithmetic_op, op=op, str_rep=str_rep)
8 changes: 4 additions & 4 deletions pandas/tests/arithmetic/test_period.py
Original file line number Diff line number Diff line change
Expand Up @@ -755,18 +755,18 @@ def test_pi_sub_isub_offset(self):
rng -= pd.offsets.MonthEnd(5)
tm.assert_index_equal(rng, expected)

def test_pi_add_offset_n_gt1(self, box_transpose_fail):
def test_pi_add_offset_n_gt1(self, box_with_array):
# GH#23215
# add offset to PeriodIndex with freq.n > 1
box, transpose = box_transpose_fail
box = box_with_array

per = pd.Period("2016-01", freq="2M")
pi = pd.PeriodIndex([per])

expected = pd.PeriodIndex(["2016-03"], freq="2M")

pi = tm.box_expected(pi, box, transpose=transpose)
expected = tm.box_expected(expected, box, transpose=transpose)
pi = tm.box_expected(pi, box)
expected = tm.box_expected(expected, box)

result = pi + per.freq
tm.assert_equal(result, expected)
Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/arrays/test_datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,8 @@ def test_only_1dim_accepted(self):
arr = np.array([0, 1, 2, 3], dtype="M8[h]").astype("M8[ns]")

with pytest.raises(ValueError, match="Only 1-dimensional"):
# 2-dim
DatetimeArray(arr.reshape(2, 2))
# 3-dim, we allow 2D to sneak in for ops purposes GH#29853
DatetimeArray(arr.reshape(2, 2, 1))

with pytest.raises(ValueError, match="Only 1-dimensional"):
# 0-dim
Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/arrays/test_timedeltas.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@ def test_only_1dim_accepted(self):
arr = np.array([0, 1, 2, 3], dtype="m8[h]").astype("m8[ns]")

with pytest.raises(ValueError, match="Only 1-dimensional"):
# 2-dim
TimedeltaArray(arr.reshape(2, 2))
# 3-dim, we allow 2D to sneak in for ops purposes GH#29853
TimedeltaArray(arr.reshape(2, 2, 1))

with pytest.raises(ValueError, match="Only 1-dimensional"):
# 0-dim
Expand Down