Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

REF: Simplify quantile, remove reduction from BlockManager #24597

Merged
merged 8 commits into from
Jan 3, 2019
104 changes: 32 additions & 72 deletions pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
from pandas.core.indexing import check_setitem_lengths
from pandas.core.internals.arrays import extract_array
import pandas.core.missing as missing
from pandas.core.nanops import nanpercentile

from pandas.io.formats.printing import pprint_thing

Expand Down Expand Up @@ -1439,7 +1440,7 @@ def _unstack(self, unstacker_func, new_columns, n_rows, fill_value):
blocks = [make_block(new_values, placement=new_placement)]
return blocks, mask

def quantile(self, qs, interpolation='linear', axis=0, axes=None):
def quantile(self, qs, interpolation='linear', axis=0):
"""
compute the quantiles of the

Expand All @@ -1448,94 +1449,53 @@ def quantile(self, qs, interpolation='linear', axis=0, axes=None):
qs: a scalar or list of the quantiles to be computed
interpolation: type of interpolation, default 'linear'
axis: axis to compute, default 0
axes : BlockManager.axes

Returns
-------
tuple of (axis, block)

Block
"""
kw = {'interpolation': interpolation}
values = self.get_values()
values, _ = self._try_coerce_args(values, values)

def _nanpercentile1D(values, mask, q, **kw):
# mask is Union[ExtensionArray, ndarray]
values = values[~mask]

if len(values) == 0:
if lib.is_scalar(q):
return self._na_value
else:
return np.array([self._na_value] * len(q),
dtype=values.dtype)

return np.percentile(values, q, **kw)

def _nanpercentile(values, q, axis, **kw):

mask = isna(self.values)
if not lib.is_scalar(mask) and mask.any():
if self.ndim == 1:
return _nanpercentile1D(values, mask, q, **kw)
else:
# for nonconsolidatable blocks mask is 1D, but values 2D
if mask.ndim < values.ndim:
mask = mask.reshape(values.shape)
if axis == 0:
values = values.T
mask = mask.T
result = [_nanpercentile1D(val, m, q, **kw) for (val, m)
in zip(list(values), list(mask))]
result = np.array(result, dtype=values.dtype, copy=False).T
return result
else:
return np.percentile(values, q, axis=axis, **kw)

from pandas import Float64Index
is_empty = values.shape[axis] == 0
if is_list_like(qs):
ax = Float64Index(qs)
orig_scalar = not is_list_like(qs)
if orig_scalar:
# make list-like, unpack later
qs = [qs]

if is_empty:
if self.ndim == 1:
result = self._na_value
else:
# create the array of na_values
# 2d len(values) * len(qs)
result = np.repeat(np.array([self._na_value] * len(qs)),
len(values)).reshape(len(values),
len(qs))
if is_empty:
if self.ndim == 1:
result = self._na_value
else:
result = _nanpercentile(values, np.array(qs) * 100,
axis=axis, **kw)

result = np.array(result, copy=False)
if self.ndim > 1:
result = result.T

# create the array of na_values
# 2d len(values) * len(qs)
result = np.repeat(np.array([self._na_value] * len(qs)),
len(values)).reshape(len(values),
len(qs))
else:
mask = isna(self.values)
result = nanpercentile(values, np.array(qs) * 100,
axis=axis, na_value=self._na_value,
mask=mask, ndim=self.ndim,
interpolation=interpolation)

if self.ndim == 1:
ax = Float64Index([qs])
else:
ax = axes[0]
result = np.array(result, copy=False)
if self.ndim > 1:
result = result.T

if is_empty:
if self.ndim == 1:
result = self._na_value
else:
result = np.array([self._na_value] * len(self))
else:
result = _nanpercentile(values, qs * 100, axis=axis, **kw)
if orig_scalar and not lib.is_scalar(result):
# result could be scalar in case with is_empty and self.ndim == 1
assert result.shape[-1] == 1, result.shape
result = result[..., 0]
result = lib.item_from_zerodim(result)

ndim = getattr(result, 'ndim', None) or 0
result = self._try_coerce_result(result)
if lib.is_scalar(result):
return ax, self.make_block_scalar(result)
return ax, make_block(result,
placement=np.arange(len(result)),
ndim=ndim)
return self.make_block_scalar(result)
return make_block(result,
placement=np.arange(len(result)),
ndim=ndim)

def _replace_coerce(self, to_replace, value, inplace=True, regex=False,
convert=False, mask=None):
Expand Down
30 changes: 20 additions & 10 deletions pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
maybe_promote)
from pandas.core.dtypes.common import (
_NS_DTYPE, is_datetimelike_v_numeric, is_extension_array_dtype,
is_extension_type, is_numeric_v_string_like, is_scalar)
is_extension_type, is_list_like, is_numeric_v_string_like, is_scalar)
import pandas.core.dtypes.concat as _concat
from pandas.core.dtypes.generic import ABCExtensionArray, ABCSeries
from pandas.core.dtypes.missing import isna
Expand Down Expand Up @@ -402,34 +402,47 @@ def apply(self, f, axes=None, filter=None, do_integrity_check=False,
bm._consolidate_inplace()
return bm

def reduction(self, f, axis=0, consolidate=True, transposed=False,
**kwargs):
def quantile(self, axis=0, consolidate=True, transposed=False,
interpolation='linear', qs=None, numeric_only=None):
"""
iterate over the blocks, collect and create a new block manager.
Iterate over blocks applying quantile reduction.
This routine is intended for reduction type operations and
will do inference on the generated blocks.

Parameters
----------
f: the callable or function name to operate on at the block level
axis: reduction axis, default 0
consolidate: boolean, default True. Join together blocks having same
dtype
transposed: boolean, default False
we are holding transposed data
interpolation : type of interpolation, default 'linear'
qs : a scalar or list of the quantiles to be computed
numeric_only : ignored

Returns
-------
Block Manager (new object)

"""

if consolidate:
self._consolidate_inplace()

def get_axe(block, qs, axes):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i don't think this adds anything to make it a function

from pandas import Float64Index
if is_list_like(qs):
ax = Float64Index(qs)
elif block.ndim == 1:
ax = Float64Index([qs])
else:
ax = axes[0]
return ax

axes, blocks = [], []
for b in self.blocks:
axe, block = getattr(b, f)(axis=axis, axes=self.axes, **kwargs)
block = b.quantile(axis=axis, qs=qs, interpolation=interpolation)

axe = get_axe(b, qs, axes=self.axes)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

also doesn't need / take the block arg

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it uses the block arg

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

no i mean get_axe doesn't

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(which is why it is a function instead of just done once outside the loop).

I'd rather keep it as a function than in-line it, but not a deal-breaker. There is another PR after this that will be ripping out a bunch of code regardless.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

no i mean get_axe doesn't

line 435 inside get_axe reads elif block.ndim == 1:

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

grr, ok, i c now


axes.append(axe)
blocks.append(block)
Expand Down Expand Up @@ -496,9 +509,6 @@ def isna(self, func, **kwargs):
def where(self, **kwargs):
return self.apply('where', **kwargs)

def quantile(self, **kwargs):
return self.reduction('quantile', **kwargs)

def setitem(self, **kwargs):
return self.apply('setitem', **kwargs)

Expand Down
72 changes: 72 additions & 0 deletions pandas/core/nanops.py
Original file line number Diff line number Diff line change
Expand Up @@ -1194,3 +1194,75 @@ def f(x, y):
nanle = make_nancomp(operator.le)
naneq = make_nancomp(operator.eq)
nanne = make_nancomp(operator.ne)


def _nanpercentile1D(values, mask, q, na_value, interpolation):
jbrockmendel marked this conversation as resolved.
Show resolved Hide resolved
"""
Wrapper for np.percentile that skips missing values, specialized to
1-dimensional case.

Parameters
----------
values : array over which to find quantiles
mask : ndarray[bool]
locations in values that should be considered missing
q : scalar or array of quantile indices to find
na_value : scalar
value to return for empty or all-null values
interpolation : str

Returns
-------
quantiles : scalar or array
"""
# mask is Union[ExtensionArray, ndarray]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you add doc-strings

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

just pushed with docstrings. We're going to simplify the tar out of these methods if/when #24600 gets fixed.

values = values[~mask]

if len(values) == 0:
if lib.is_scalar(q):
return na_value
else:
return np.array([na_value] * len(q),
dtype=values.dtype)

return np.percentile(values, q, interpolation=interpolation)


def nanpercentile(values, q, axis, na_value, mask, ndim, interpolation):
    """
    Wrapper for np.percentile that skips missing values.

    When ``mask`` is a scalar or flags nothing as missing, the call is
    forwarded to np.percentile unchanged. Otherwise the masked entries are
    dropped before the percentiles are computed: directly for 1-dimensional
    input, and one slice at a time for 2-dimensional input.

    Parameters
    ----------
    values : array over which to find quantiles
    q : scalar or array of quantile indices to find
    axis : {0, 1}
    na_value : scalar
        value to return for empty or all-null values
    mask : ndarray[bool]
        locations in values that should be considered missing
    ndim : {1, 2}
    interpolation : str

    Returns
    -------
    quantiles : scalar or array
    """
    if lib.is_scalar(mask) or not mask.any():
        # Nothing is flagged as missing, so np.percentile can be used as-is.
        return np.percentile(values, q, axis=axis,
                             interpolation=interpolation)

    if ndim == 1:
        return _nanpercentile1D(values, mask, q, na_value,
                                interpolation=interpolation)

    # for nonconsolidatable blocks mask is 1D, but values 2D
    if mask.ndim < values.ndim:
        mask = mask.reshape(values.shape)
    if axis == 0:
        # Transpose so that iterating yields the slices to be reduced.
        values = values.T
        mask = mask.T

    result = [_nanpercentile1D(val, m, q, na_value,
                               interpolation=interpolation)
              for (val, m) in zip(values, mask)]
    # np.asarray instead of np.array(..., copy=False): building an array
    # from a list always needs a copy, and NumPy >= 2.0 raises ValueError
    # for copy=False in that case. np.asarray copies only when required on
    # every NumPy version.
    return np.asarray(result, dtype=values.dtype).T