Skip to content

Commit

Permalink
Closes #1787: Implement GroupBy.std() and GroupBy.var()
Browse files Browse the repository at this point in the history
This PR (Closes #1787):
- Adds `std` and `var` functionality to groupby aggregation
- Updates `compare_keys` in `groupby_tests` to account for NAN values

Note:
We default to `ddof=1` here to follow pandas; this differs from the default of `ddof=0` in `pdarrayclass`, which follows numpy.
  • Loading branch information
Pierce Hayes committed Sep 16, 2022
1 parent 09c5714 commit ff9f53e
Show file tree
Hide file tree
Showing 3 changed files with 247 additions and 53 deletions.
159 changes: 151 additions & 8 deletions arkouda/groupbyclass.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from arkouda.client import generic_msg
from arkouda.dtypes import float64 as akfloat64
from arkouda.dtypes import int64 as akint64
from arkouda.dtypes import int_scalars
from arkouda.dtypes import uint64 as akuint64
from arkouda.infoclass import list_registry
from arkouda.logger import getArkoudaLogger
Expand Down Expand Up @@ -141,6 +142,8 @@ def unique(
class GroupByReductionType(enum.Enum):
SUM = "sum"
PROD = "prod"
VAR = "var"
STD = "std"
MEAN = "mean"
MEDIAN = "median"
MIN = "min"
Expand Down Expand Up @@ -347,7 +350,7 @@ def count(self) -> Tuple[groupable, pdarray]:
return self.unique_keys, create_pdarray(repMsg)

def aggregate(
self, values: groupable, operator: str, skipna: bool = True
self, values: groupable, operator: str, skipna: bool = True, ddof: int_scalars = 1
) -> Tuple[groupable, groupable]:
"""
Using the permutation stored in the GroupBy instance, group another
Expand All @@ -359,6 +362,10 @@ def aggregate(
The values to group and reduce
operator: str
The name of the reduction operator to use
skipna: bool
boolean which determines if NANs should be skipped
ddof : int_scalars
"Delta Degrees of Freedom" used in calculating std and var
Returns
-------
Expand Down Expand Up @@ -393,7 +400,6 @@ def aggregate(
-0.55555555555555558, -0.33333333333333337, -0.11111111111111116, 0.11111111111111116,
0.33333333333333326, 0.55555555555555536, 0.77777777777777768, 1]))
"""

operator = operator.lower()
if operator not in self.Reductions:
raise ValueError(f"Unsupported reduction: {operator}\nMust be one of {self.Reductions}")
Expand Down Expand Up @@ -424,6 +430,7 @@ def aggregate(
"segments": self.segments,
"op": operator,
"skip_nan": skipna,
"ddof": ddof,
},
)
self.logger.debug(repMsg)
Expand Down Expand Up @@ -451,6 +458,8 @@ def sum(self, values: pdarray, skipna: bool = True) -> Tuple[groupable, pdarray]
The unique keys, in grouped order
group_sums : pdarray
One sum per unique key in the GroupBy instance
skipna: bool
boolean which determines if NANs should be skipped
Raises
------
Expand Down Expand Up @@ -491,6 +500,8 @@ def prod(self, values: pdarray, skipna: bool = True) -> Tuple[groupable, pdarray
----------
values : pdarray
The values to group and multiply
skipna: bool
boolean which determines if NANs should be skipped
Returns
-------
Expand Down Expand Up @@ -530,6 +541,135 @@ def prod(self, values: pdarray, skipna: bool = True) -> Tuple[groupable, pdarray
k, v = self.aggregate(values, "prod", skipna)
return k, cast(pdarray, v)

def var(
    self, values: pdarray, skipna: bool = True, ddof: int_scalars = 1
) -> Tuple[groupable, pdarray]:
    """
    Using the permutation stored in the GroupBy instance, group
    another array of values and compute the variance of
    each group's values.

    Parameters
    ----------
    values : pdarray
        The values to group and find variance
    skipna : bool
        boolean which determines if NANs should be skipped
    ddof : int_scalars
        "Delta Degrees of Freedom" used in calculating var

    Returns
    -------
    unique_keys : (list of) pdarray or Strings
        The unique keys, in grouped order
    group_vars : pdarray, float64
        One var value per unique key in the GroupBy instance

    Raises
    ------
    TypeError
        Raised if the values array is not a pdarray object
    ValueError
        Raised if the key array size does not match the values size
        or if the operator is not in the GroupBy.Reductions array

    Notes
    -----
    The return dtype is always float64.

    The variance is the average of the squared deviations from the mean,
    i.e., ``var = mean((x - x.mean())**2)``.

    With ``ddof=0`` the average is taken with divisor ``N``, where
    ``N = len(x)``; in general the divisor ``N - ddof`` is used. The
    default in this method is ``ddof=1``. In standard statistical
    practice, ``ddof=1`` provides an unbiased estimator of the variance
    of a hypothetical infinite population. ``ddof=0`` provides a maximum
    likelihood estimate of the variance for normally distributed variables.

    Examples
    --------
    >>> a = ak.randint(1,5,10)
    >>> a
    array([3, 3, 4, 3, 3, 2, 3, 2, 4, 2])
    >>> g = ak.GroupBy(a)
    >>> g.keys
    array([3, 3, 4, 3, 3, 2, 3, 2, 4, 2])
    >>> b = ak.randint(1,5,10)
    >>> b
    array([3, 3, 3, 4, 1, 1, 3, 3, 3, 4])
    >>> g.var(b)
    (array([2 3 4]), array([2.333333333333333 1.2 0]))
    """
    # Delegate to the generic reduction entry point; "var" selects the
    # variance reduction and ddof is forwarded unchanged.
    unique_keys, group_vars = self.aggregate(values, "var", skipna, ddof)
    # aggregate is annotated as returning a groupable; narrow the static
    # type to pdarray for callers (no runtime conversion happens here).
    return unique_keys, cast(pdarray, group_vars)

def std(
    self, values: pdarray, skipna: bool = True, ddof: int_scalars = 1
) -> Tuple[groupable, pdarray]:
    """
    Using the permutation stored in the GroupBy instance, group
    another array of values and compute the standard deviation of
    each group's values.

    Parameters
    ----------
    values : pdarray
        The values to group and find standard deviation
    skipna : bool
        boolean which determines if NANs should be skipped
    ddof : int_scalars
        "Delta Degrees of Freedom" used in calculating std

    Returns
    -------
    unique_keys : (list of) pdarray or Strings
        The unique keys, in grouped order
    group_stds : pdarray, float64
        One std value per unique key in the GroupBy instance

    Raises
    ------
    TypeError
        Raised if the values array is not a pdarray object
    ValueError
        Raised if the key array size does not match the values size
        or if the operator is not in the GroupBy.Reductions array

    Notes
    -----
    The return dtype is always float64.

    The standard deviation is the square root of the average of the squared
    deviations from the mean, i.e., ``std = sqrt(mean((x - x.mean())**2))``.

    With ``ddof=0`` the average squared deviation is taken with divisor
    ``N``, where ``N = len(x)``; in general the divisor ``N - ddof`` is
    used. The default in this method is ``ddof=1``. In standard statistical
    practice, ``ddof=1`` provides an unbiased estimator of the variance
    of the infinite population. ``ddof=0`` provides a maximum likelihood
    estimate of the variance for normally distributed variables. The
    standard deviation computed in this function is the square root of
    the estimated variance, so even with ``ddof=1``, it will not be an
    unbiased estimate of the standard deviation per se.

    Examples
    --------
    >>> a = ak.randint(1,5,10)
    >>> a
    array([3, 3, 4, 3, 3, 2, 3, 2, 4, 2])
    >>> g = ak.GroupBy(a)
    >>> g.keys
    array([3, 3, 4, 3, 3, 2, 3, 2, 4, 2])
    >>> b = ak.randint(1,5,10)
    >>> b
    array([3, 3, 3, 4, 1, 1, 3, 3, 3, 4])
    >>> g.std(b)
    (array([2 3 4]), array([1.5275252316519465 1.0954451150103321 0]))
    """
    # Delegate to the generic reduction entry point; "std" selects the
    # standard-deviation reduction and ddof is forwarded unchanged.
    unique_keys, group_stds = self.aggregate(values, "std", skipna, ddof)
    # aggregate is annotated as returning a groupable; narrow the static
    # type to pdarray for callers (no runtime conversion happens here).
    return unique_keys, cast(pdarray, group_stds)

def mean(self, values: pdarray, skipna: bool = True) -> Tuple[groupable, pdarray]:
"""
Using the permutation stored in the GroupBy instance, group
Expand All @@ -540,6 +680,8 @@ def mean(self, values: pdarray, skipna: bool = True) -> Tuple[groupable, pdarray
----------
values : pdarray
The values to group and average
skipna: bool
boolean which determines if NANs should be skipped
Returns
-------
Expand Down Expand Up @@ -587,12 +729,14 @@ def median(self, values: pdarray, skipna: bool = True) -> Tuple[groupable, pdarr
----------
values : pdarray
The values to group and find median
skipna: bool
boolean which determines if NANs should be skipped
Returns
-------
unique_keys : (list of) pdarray or Strings
The unique keys, in grouped order
group_means : pdarray, float64
group_medians : pdarray, float64
One median value per unique key in the GroupBy instance
Raises
Expand Down Expand Up @@ -634,6 +778,8 @@ def min(self, values: pdarray, skipna: bool = True) -> Tuple[groupable, pdarray]
----------
values : pdarray
The values to group and find minima
skipna: bool
boolean which determines if NANs should be skipped
Returns
-------
Expand Down Expand Up @@ -682,6 +828,8 @@ def max(self, values: pdarray, skipna: bool = True) -> Tuple[groupable, pdarray]
----------
values : pdarray
The values to group and find maxima
skipna: bool
boolean which determines if NANs should be skipped
Returns
-------
Expand Down Expand Up @@ -768,7 +916,6 @@ def argmin(self, values: pdarray) -> Tuple[groupable, pdarray]:
>>> g.argmin(b)
(array([2, 3, 4]), array([5, 4, 2]))
"""

k, v = self.aggregate(values, "argmin")
return k, cast(pdarray, v)

Expand Down Expand Up @@ -818,7 +965,6 @@ def argmax(self, values: pdarray) -> Tuple[groupable, pdarray]:
>>> g.argmax(b)
(array([2, 3, 4]), array([9, 3, 2]))
"""

k, v = self.aggregate(values, "argmax")
return k, cast(pdarray, v)

Expand Down Expand Up @@ -885,7 +1031,6 @@ def nunique(self, values: groupable) -> Tuple[groupable, pdarray]:
"""
# TO DO: defer to self.aggregate once logic is ported over to Chapel
# return self.aggregate(values, "nunique")

togroup = self._nested_grouping_helper(values)
# Find unique pairs of (key, val)
g = GroupBy(togroup)
Expand Down Expand Up @@ -1393,7 +1538,6 @@ def unregister(self):
Objects registered with the server are immune to deletion until
they are unregistered.
"""

if not self.name:
raise RegistrationError(
"This item does not have a name and does not appear to be registered."
Expand Down Expand Up @@ -1516,7 +1660,6 @@ def attach(user_defined_name: str) -> GroupBy:
--------
register, is_registered, unregister, unregister_groupby_by_name
"""

from re import compile, match

from arkouda.categorical import Categorical
Expand Down
Loading

0 comments on commit ff9f53e

Please sign in to comment.