From 4056ed282f38aa97d17e1b198b44fd7c29a140c5 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Thu, 3 Jan 2019 09:32:21 -0800 Subject: [PATCH 1/6] change BlockManager.reduction to quantile, simplify it a bunch --- pandas/core/internals/blocks.py | 140 ++++++++++++++---------------- pandas/core/internals/managers.py | 30 ++++--- 2 files changed, 83 insertions(+), 87 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index d12114bd951ba..5e7e7cb050c64 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1448,101 +1448,53 @@ def quantile(self, qs, interpolation='linear', axis=0, axes=None): qs: a scalar or list of the quantiles to be computed interpolation: type of interpolation, default 'linear' axis: axis to compute, default 0 - axes : BlockManager.axes Returns ------- - tuple of (axis, block) - + Block """ kw = {'interpolation': interpolation} values = self.get_values() values, _ = self._try_coerce_args(values, values) - def _nanpercentile1D(values, mask, q, **kw): - # mask is Union[ExtensionArray, ndarray] - values = values[~mask] - - if len(values) == 0: - if lib.is_scalar(q): - return self._na_value - else: - return np.array([self._na_value] * len(q), - dtype=values.dtype) - - return np.percentile(values, q, **kw) - - def _nanpercentile(values, q, axis, **kw): - - mask = isna(self.values) - if not lib.is_scalar(mask) and mask.any(): - if self.ndim == 1: - return _nanpercentile1D(values, mask, q, **kw) - else: - # for nonconsolidatable blocks mask is 1D, but values 2D - if mask.ndim < values.ndim: - mask = mask.reshape(values.shape) - if axis == 0: - values = values.T - mask = mask.T - result = [_nanpercentile1D(val, m, q, **kw) for (val, m) - in zip(list(values), list(mask))] - result = np.array(result, dtype=values.dtype, copy=False).T - return result - else: - return np.percentile(values, q, axis=axis, **kw) - - from pandas import Float64Index is_empty = values.shape[axis] == 0 - if 
is_list_like(qs): - ax = Float64Index(qs) + orig_scalar = not is_list_like(qs) + if orig_scalar: + # make list-like, unpack later + qs = [qs] - if is_empty: - if self.ndim == 1: - result = self._na_value - else: - # create the array of na_values - # 2d len(values) * len(qs) - result = np.repeat(np.array([self._na_value] * len(qs)), - len(values)).reshape(len(values), - len(qs)) + if is_empty: + if self.ndim == 1: + result = self._na_value else: - - try: - result = _nanpercentile(values, np.array(qs) * 100, - axis=axis, **kw) - except ValueError: - - # older numpies don't handle an array for q - result = [_nanpercentile(values, q * 100, - axis=axis, **kw) for q in qs] - - result = np.array(result, copy=False) - if self.ndim > 1: - result = result.T - + # create the array of na_values + # 2d len(values) * len(qs) + result = np.repeat(np.array([self._na_value] * len(qs)), + len(values)).reshape(len(values), + len(qs)) else: + mask = isna(self.values) + result = _nanpercentile(values, np.array(qs) * 100, + axis=axis, na_value=self._na_value, + mask=mask, ndim=self.ndim, **kw) - if self.ndim == 1: - ax = Float64Index([qs]) - else: - ax = axes[0] + result = np.array(result, copy=False) + if self.ndim > 1: + result = result.T - if is_empty: - if self.ndim == 1: - result = self._na_value - else: - result = np.array([self._na_value] * len(self)) - else: - result = _nanpercentile(values, qs * 100, axis=axis, **kw) + if orig_scalar and not lib.is_scalar(result): + # result could be scalar in case with is_empty and self.ndim == 1 + assert result.shape[-1] == 1, result.shape + result = result[..., 0] + result = lib.item_from_zerodim(result) ndim = getattr(result, 'ndim', None) or 0 result = self._try_coerce_result(result) if lib.is_scalar(result): - return ax, self.make_block_scalar(result) - return ax, make_block(result, - placement=np.arange(len(result)), - ndim=ndim) + return self.make_block_scalar(result) + return make_block(result, + placement=np.arange(len(result)), + 
ndim=ndim) def _replace_coerce(self, to_replace, value, inplace=True, regex=False, convert=False, mask=None): @@ -3421,3 +3373,37 @@ def _putmask_preserve(nv, n): v = v.astype(dtype) return _putmask_preserve(v, n) + + +# TODO: belongs elsewhere? +def _nanpercentile1D(values, mask, q, na_value, **kw): + # mask is Union[ExtensionArray, ndarray] + values = values[~mask] + + if len(values) == 0: + if lib.is_scalar(q): + return na_value + else: + return np.array([na_value] * len(q), + dtype=values.dtype) + + return np.percentile(values, q, **kw) + + +def _nanpercentile(values, q, axis, na_value, mask, ndim, **kw): + if not lib.is_scalar(mask) and mask.any(): + if ndim == 1: + return _nanpercentile1D(values, mask, q, na_value, **kw) + else: + # for nonconsolidatable blocks mask is 1D, but values 2D + if mask.ndim < values.ndim: + mask = mask.reshape(values.shape) + if axis == 0: + values = values.T + mask = mask.T + result = [_nanpercentile1D(val, m, q, na_value, **kw) for (val, m) + in zip(list(values), list(mask))] + result = np.array(result, dtype=values.dtype, copy=False).T + return result + else: + return np.percentile(values, q, axis=axis, **kw) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index eba49d18431ef..0ad0a994e8a95 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -16,7 +16,7 @@ maybe_promote) from pandas.core.dtypes.common import ( _NS_DTYPE, is_datetimelike_v_numeric, is_extension_array_dtype, - is_extension_type, is_numeric_v_string_like, is_scalar) + is_extension_type, is_list_like, is_numeric_v_string_like, is_scalar) import pandas.core.dtypes.concat as _concat from pandas.core.dtypes.generic import ABCExtensionArray, ABCSeries from pandas.core.dtypes.missing import isna @@ -402,34 +402,47 @@ def apply(self, f, axes=None, filter=None, do_integrity_check=False, bm._consolidate_inplace() return bm - def reduction(self, f, axis=0, consolidate=True, transposed=False, - 
**kwargs): + def quantile(self, axis=0, consolidate=True, transposed=False, + interpolation='linear', qs=None, numeric_only=None): """ - iterate over the blocks, collect and create a new block manager. + Iterate over blocks applying quantile reduction. This routine is intended for reduction type operations and will do inference on the generated blocks. Parameters ---------- - f: the callable or function name to operate on at the block level axis: reduction axis, default 0 consolidate: boolean, default True. Join together blocks having same dtype transposed: boolean, default False we are holding transposed data + interpolation : type of interpolation, default 'linear' + qs : a scalar or list of the quantiles to be computed + numeric_only : ignored Returns ------- Block Manager (new object) - """ if consolidate: self._consolidate_inplace() + def get_axe(block, qs, axes): + from pandas import Float64Index + if is_list_like(qs): + ax = Float64Index(qs) + elif block.ndim == 1: + ax = Float64Index([qs]) + else: + ax = axes[0] + return ax + axes, blocks = [], [] for b in self.blocks: - axe, block = getattr(b, f)(axis=axis, axes=self.axes, **kwargs) + block = b.quantile(axis=axis, qs=qs, interpolation=interpolation) + + axe = get_axe(b, qs, axes=self.axes) axes.append(axe) blocks.append(block) @@ -496,9 +509,6 @@ def isna(self, func, **kwargs): def where(self, **kwargs): return self.apply('where', **kwargs) - def quantile(self, **kwargs): - return self.reduction('quantile', **kwargs) - def setitem(self, **kwargs): return self.apply('setitem', **kwargs) From 30543bfdc97131a123f0a8256bd750741b1ab3ee Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Thu, 3 Jan 2019 10:23:41 -0800 Subject: [PATCH 2/6] remove unused kwarg --- pandas/core/internals/blocks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 57b96595bbfa9..05e9ab03735bc 100644 --- a/pandas/core/internals/blocks.py +++ 
b/pandas/core/internals/blocks.py @@ -1439,7 +1439,7 @@ def _unstack(self, unstacker_func, new_columns, n_rows, fill_value): blocks = [make_block(new_values, placement=new_placement)] return blocks, mask - def quantile(self, qs, interpolation='linear', axis=0, axes=None): + def quantile(self, qs, interpolation='linear', axis=0): """ compute the quantiles of the From d26f7731686e258aa0fefd4eacb257aa9699f9d8 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Thu, 3 Jan 2019 10:25:15 -0800 Subject: [PATCH 3/6] make **kw explicit --- pandas/core/internals/blocks.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 05e9ab03735bc..1f504386ff67e 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1453,7 +1453,6 @@ def quantile(self, qs, interpolation='linear', axis=0): ------- Block """ - kw = {'interpolation': interpolation} values = self.get_values() values, _ = self._try_coerce_args(values, values) @@ -1476,7 +1475,8 @@ def quantile(self, qs, interpolation='linear', axis=0): mask = isna(self.values) result = _nanpercentile(values, np.array(qs) * 100, axis=axis, na_value=self._na_value, - mask=mask, ndim=self.ndim, **kw) + mask=mask, ndim=self.ndim, + interpolation=interpolation) result = np.array(result, copy=False) if self.ndim > 1: @@ -3327,7 +3327,7 @@ def _putmask_preserve(nv, n): # TODO: belongs elsewhere? 
-def _nanpercentile1D(values, mask, q, na_value, **kw): +def _nanpercentile1D(values, mask, q, na_value, interpolation): # mask is Union[ExtensionArray, ndarray] values = values[~mask] @@ -3338,13 +3338,14 @@ def _nanpercentile1D(values, mask, q, na_value, **kw): return np.array([na_value] * len(q), dtype=values.dtype) - return np.percentile(values, q, **kw) + return np.percentile(values, q, interpolation=interpolation) -def _nanpercentile(values, q, axis, na_value, mask, ndim, **kw): +def _nanpercentile(values, q, axis, na_value, mask, ndim, interpolation): if not lib.is_scalar(mask) and mask.any(): if ndim == 1: - return _nanpercentile1D(values, mask, q, na_value, **kw) + return _nanpercentile1D(values, mask, q, na_value, + interpolation=interpolation) else: # for nonconsolidatable blocks mask is 1D, but values 2D if mask.ndim < values.ndim: @@ -3352,9 +3353,10 @@ def _nanpercentile(values, q, axis, na_value, mask, ndim, **kw): if axis == 0: values = values.T mask = mask.T - result = [_nanpercentile1D(val, m, q, na_value, **kw) for (val, m) - in zip(list(values), list(mask))] + result = [_nanpercentile1D(val, m, q, na_value, + interpolation=interpolation) + for (val, m) in zip(list(values), list(mask))] result = np.array(result, dtype=values.dtype, copy=False).T return result else: - return np.percentile(values, q, axis=axis, **kw) + return np.percentile(values, q, axis=axis, interpolation=interpolation) From 379fdde377d14cf18dbef590284e5b9db03a1d17 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Thu, 3 Jan 2019 10:28:02 -0800 Subject: [PATCH 4/6] move nanpercentile to nanops --- pandas/core/internals/blocks.py | 45 ++++----------------------------- pandas/core/nanops.py | 35 +++++++++++++++++++++++++ 2 files changed, 40 insertions(+), 40 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 1f504386ff67e..c5a1ec77012f1 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -43,6 +43,7 
@@ from pandas.core.indexing import check_setitem_lengths from pandas.core.internals.arrays import extract_array import pandas.core.missing as missing +from pandas.core.nanops import nanpercentile from pandas.io.formats.printing import pprint_thing @@ -1473,10 +1474,10 @@ def quantile(self, qs, interpolation='linear', axis=0): len(qs)) else: mask = isna(self.values) - result = _nanpercentile(values, np.array(qs) * 100, - axis=axis, na_value=self._na_value, - mask=mask, ndim=self.ndim, - interpolation=interpolation) + result = nanpercentile(values, np.array(qs) * 100, + axis=axis, na_value=self._na_value, + mask=mask, ndim=self.ndim, + interpolation=interpolation) result = np.array(result, copy=False) if self.ndim > 1: @@ -3324,39 +3325,3 @@ def _putmask_preserve(nv, n): v = v.astype(dtype) return _putmask_preserve(v, n) - - -# TODO: belongs elsewhere? -def _nanpercentile1D(values, mask, q, na_value, interpolation): - # mask is Union[ExtensionArray, ndarray] - values = values[~mask] - - if len(values) == 0: - if lib.is_scalar(q): - return na_value - else: - return np.array([na_value] * len(q), - dtype=values.dtype) - - return np.percentile(values, q, interpolation=interpolation) - - -def _nanpercentile(values, q, axis, na_value, mask, ndim, interpolation): - if not lib.is_scalar(mask) and mask.any(): - if ndim == 1: - return _nanpercentile1D(values, mask, q, na_value, - interpolation=interpolation) - else: - # for nonconsolidatable blocks mask is 1D, but values 2D - if mask.ndim < values.ndim: - mask = mask.reshape(values.shape) - if axis == 0: - values = values.T - mask = mask.T - result = [_nanpercentile1D(val, m, q, na_value, - interpolation=interpolation) - for (val, m) in zip(list(values), list(mask))] - result = np.array(result, dtype=values.dtype, copy=False).T - return result - else: - return np.percentile(values, q, axis=axis, interpolation=interpolation) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index f95c133163ddb..c065865604158 100644 
--- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -1194,3 +1194,38 @@ def f(x, y): nanle = make_nancomp(operator.le) naneq = make_nancomp(operator.eq) nanne = make_nancomp(operator.ne) + + +def _nanpercentile1D(values, mask, q, na_value, interpolation): + # mask is Union[ExtensionArray, ndarray] + values = values[~mask] + + if len(values) == 0: + if lib.is_scalar(q): + return na_value + else: + return np.array([na_value] * len(q), + dtype=values.dtype) + + return np.percentile(values, q, interpolation=interpolation) + + +def nanpercentile(values, q, axis, na_value, mask, ndim, interpolation): + if not lib.is_scalar(mask) and mask.any(): + if ndim == 1: + return _nanpercentile1D(values, mask, q, na_value, + interpolation=interpolation) + else: + # for nonconsolidatable blocks mask is 1D, but values 2D + if mask.ndim < values.ndim: + mask = mask.reshape(values.shape) + if axis == 0: + values = values.T + mask = mask.T + result = [_nanpercentile1D(val, m, q, na_value, + interpolation=interpolation) + for (val, m) in zip(list(values), list(mask))] + result = np.array(result, dtype=values.dtype, copy=False).T + return result + else: + return np.percentile(values, q, axis=axis, interpolation=interpolation) From 319a77b4de16cac591d49b7eeafb665b48f83f0c Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Thu, 3 Jan 2019 11:46:13 -0800 Subject: [PATCH 5/6] docstrings --- pandas/core/nanops.py | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index c065865604158..5a87ccc1c2179 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -1197,6 +1197,24 @@ def f(x, y): def _nanpercentile1D(values, mask, q, na_value, interpolation): + """ + Wrapper for np.percentile that skips missing values, specialized to + 1-dimensional case. 
+ + Parameters + ---------- + values : array over which to find quantiles + mask : ndarray[bool] + locations in values that should be considered missing + q : scalar or array of quantile indices to find + na_value : scalar + value to return for empty or all-null values + interpolation : str + + Returns + ------- + quantiles : scalar or array + """ # mask is Union[ExtensionArray, ndarray] values = values[~mask] @@ -1211,6 +1229,25 @@ def _nanpercentile1D(values, mask, q, na_value, interpolation): def nanpercentile(values, q, axis, na_value, mask, ndim, interpolation): + """ + Wrapper for np.percentile that skips missing values. + + Parameters + ---------- + values : array over which to find quantiles + q : scalar or array of quantile indices to find + axis : {0, 1} + na_value : scalar + value to return for empty or all-null values + mask : ndarray[bool] + locations in values that should be considered missing + ndim : {1, 2} + interpolation : str + + Returns + ------- + quantiles : scalar or array + """ if not lib.is_scalar(mask) and mask.any(): if ndim == 1: return _nanpercentile1D(values, mask, q, na_value, From 11af1ddd1f204208cc6f0290a6329647cec92e92 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Thu, 3 Jan 2019 14:03:52 -0800 Subject: [PATCH 6/6] rename --- pandas/core/nanops.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 5a87ccc1c2179..89e191f171f97 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -1196,7 +1196,7 @@ def f(x, y): nanne = make_nancomp(operator.ne) -def _nanpercentile1D(values, mask, q, na_value, interpolation): +def _nanpercentile_1d(values, mask, q, na_value, interpolation): """ Wrapper for np.percentile that skips missing values, specialized to 1-dimensional case. 
@@ -1250,8 +1250,8 @@ def nanpercentile(values, q, axis, na_value, mask, ndim, interpolation): """ if not lib.is_scalar(mask) and mask.any(): if ndim == 1: - return _nanpercentile1D(values, mask, q, na_value, - interpolation=interpolation) + return _nanpercentile_1d(values, mask, q, na_value, + interpolation=interpolation) else: # for nonconsolidatable blocks mask is 1D, but values 2D if mask.ndim < values.ndim: @@ -1259,8 +1259,8 @@ def nanpercentile(values, q, axis, na_value, mask, ndim, interpolation): if axis == 0: values = values.T mask = mask.T - result = [_nanpercentile1D(val, m, q, na_value, - interpolation=interpolation) + result = [_nanpercentile_1d(val, m, q, na_value, + interpolation=interpolation) for (val, m) in zip(list(values), list(mask))] result = np.array(result, dtype=values.dtype, copy=False).T return result