Skip to content

Commit

Permalink
FIX-modin-project#1953: Fix computing of reduced indices for reduction operation
Browse files (browse the repository at this point in the history)

Signed-off-by: Igoshev, Yaroslav <[email protected]>
  • Loading branch information
YarShev committed Sep 2, 2020
1 parent ff6ff0d commit 8fec8a3
Show file tree
Hide file tree
Showing 7 changed files with 800 additions and 231 deletions.
27 changes: 19 additions & 8 deletions modin/engines/base/frame/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,9 +78,9 @@ def __init__(
)
self._column_widths_cache = column_widths
self._dtypes = dtypes
self._filter_empties()
if validate_axes is not False:
self._validate_internal_indices(mode=validate_axes)
self._filter_empties()

@property
def _row_lengths(self):
Expand Down Expand Up @@ -284,6 +284,11 @@ def _validate_axis_equality(self, axis: int, force: bool = False):
is_lenghts_matches = len(self.axes[axis]) == len(internal_axis)
if not is_equals:
if force:
if not is_lenghts_matches:
if axis:
self._column_widths_cache = None
else:
self._row_lengths_cache = None
new_axis = self.axes[axis] if is_lenghts_matches else internal_axis
self._set_axis(axis, new_axis, cache_only=not is_lenghts_matches)
else:
Expand Down Expand Up @@ -336,9 +341,9 @@ def _validate_internal_indices(self, mode=None, **kwargs):
args = args_dict.get(mode, args_dict["custom"])

if args.get("validate_index", True):
self._validate_axis_equality(axis=0)
self._validate_axis_equality(axis=0, force=args.get("force"))
if args.get("validate_columns", True):
self._validate_axis_equality(axis=1)
self._validate_axis_equality(axis=1, force=args.get("force"))

def _apply_index_objs(self, axis=None):
"""Lazily applies the index object (Index or Columns) to the partitions.
Expand Down Expand Up @@ -1000,13 +1005,19 @@ def _compute_map_reduce_metadata(self, axis, new_parts):
)

def _fold_reduce(self, axis, func):
"""Applies map that reduce Manager to series but require knowledge of full axis.
"""
Apply function that reduce Manager to series but require knowledge of full axis.
Args:
func: Function to reduce the Manager by. This function takes in a Manager.
axis: axis to apply the function to.
Parameters
----------
axis : 0 or 1
The axis to apply the function to (0 - index, 1 - columns).
func : callable
The function to reduce the Manager by. This function takes in a Manager.
Return:
Returns
-------
BasePandasFrame
Pandas series containing the reduced data.
"""
func = self._build_mapreduce_func(axis, func)
Expand Down
193 changes: 0 additions & 193 deletions modin/pandas/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -1655,29 +1655,6 @@ def mean(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs):
)
)

def median(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs):
"""Computes median across the DataFrame.
Args:
axis (int): The axis to take the median on.
skipna (bool): True to skip NA values, false otherwise.
Returns:
The median of the DataFrame. (Pandas series)
"""
axis = self._get_axis_number(axis) if axis is not None else 0
if numeric_only is not None and not numeric_only:
self._validate_dtypes(numeric_only=True)
return self._reduce_dimension(
self._query_compiler.median(
axis=axis,
skipna=skipna,
level=level,
numeric_only=numeric_only,
**kwargs,
)
)

def memory_usage(self, index=True, deep=False):
"""Returns the memory usage of each column in bytes
Expand Down Expand Up @@ -1862,52 +1839,6 @@ def pow(self, other, axis="columns", level=None, fill_value=None):
"pow", other, axis=axis, level=level, fill_value=fill_value
)

def prod(
self,
axis=None,
skipna=None,
level=None,
numeric_only=None,
min_count=0,
**kwargs,
):
"""Return the product of the values for the requested axis
Args:
axis : {index (0), columns (1)}
skipna : boolean, default True
level : int or level name, default None
numeric_only : boolean, default None
min_count : int, default 0
Returns:
prod : Series or DataFrame (if level specified)
"""
axis = self._get_axis_number(axis) if axis is not None else 0
data = self._validate_dtypes_sum_prod_mean(axis, numeric_only, ignore_axis=True)
if min_count > 1:
return data._reduce_dimension(
query_compiler=data._query_compiler.prod_min_count(
axis=axis,
skipna=skipna,
level=level,
numeric_only=numeric_only,
min_count=min_count,
**kwargs,
)
)
return data._reduce_dimension(
data._query_compiler.prod(
axis=axis,
skipna=skipna,
level=level,
numeric_only=numeric_only,
min_count=min_count,
**kwargs,
)
)

product = prod
radd = add

def quantile(self, q=0.5, axis=0, numeric_only=True, interpolation="linear"):
Expand Down Expand Up @@ -2733,32 +2664,6 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None):
else:
return self.tshift(periods, freq)

def skew(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs):
"""Return unbiased skew over requested axis Normalized by N-1
Args:
axis : {index (0), columns (1)}
skipna : boolean, default True
Exclude NA/null values when computing the result.
level : int or level name, default None
numeric_only : boolean, default None
Returns:
skew : Series or DataFrame (if level specified)
"""
axis = self._get_axis_number(axis) if axis is not None else 0
if numeric_only is not None and not numeric_only:
self._validate_dtypes(numeric_only=True)
return self._reduce_dimension(
self._query_compiler.skew(
axis=axis,
skipna=skipna,
level=level,
numeric_only=numeric_only,
**kwargs,
)
)

def sort_index(
self,
axis=0,
Expand Down Expand Up @@ -2842,33 +2747,6 @@ def sort_values(
)
return self._create_or_update_from_compiler(result, inplace)

def std(
self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs
):
"""Computes standard deviation across the DataFrame.
Args:
axis (int): The axis to take the std on.
skipna (bool): True to skip NA values, false otherwise.
ddof (int): degrees of freedom
Returns:
The std of the DataFrame (Pandas Series)
"""
axis = self._get_axis_number(axis) if axis is not None else 0
if numeric_only is not None and not numeric_only:
self._validate_dtypes(numeric_only=True)
return self._reduce_dimension(
self._query_compiler.std(
axis=axis,
skipna=skipna,
level=level,
ddof=ddof,
numeric_only=numeric_only,
**kwargs,
)
)

def sub(self, other, axis="columns", level=None, fill_value=None):
"""Subtract a DataFrame/Series/scalar from this DataFrame.
Expand All @@ -2887,50 +2765,6 @@ def sub(self, other, axis="columns", level=None, fill_value=None):

subtract = sub

def sum(
self,
axis=None,
skipna=None,
level=None,
numeric_only=None,
min_count=0,
**kwargs,
):
"""Perform a sum across the DataFrame.
Args:
axis (int): The axis to sum on.
skipna (bool): True to skip NA values, false otherwise.
Returns:
The sum of the DataFrame.
"""
axis = self._get_axis_number(axis) if axis is not None else 0
data = self._validate_dtypes_sum_prod_mean(
axis, numeric_only, ignore_axis=False
)
if min_count > 1:
return data._reduce_dimension(
query_compiler=data._query_compiler.sum_min_count(
axis=axis,
skipna=skipna,
level=level,
numeric_only=numeric_only,
min_count=min_count,
**kwargs,
)
)
return data._reduce_dimension(
data._query_compiler.sum(
axis=axis,
skipna=skipna,
level=level,
numeric_only=numeric_only,
min_count=min_count,
**kwargs,
)
)

def swapaxes(self, axis1, axis2, copy=True):
axis1 = self._get_axis_number(axis1)
axis2 = self._get_axis_number(axis2)
Expand Down Expand Up @@ -3333,33 +3167,6 @@ def tz_localize(
)
return self.set_axis(labels=new_labels, axis=axis, inplace=not copy)

def var(
self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs
):
"""Computes variance across the DataFrame.
Args:
axis (int): The axis to take the variance on.
skipna (bool): True to skip NA values, false otherwise.
ddof (int): degrees of freedom
Returns:
The variance of the DataFrame.
"""
axis = self._get_axis_number(axis) if axis is not None else 0
if numeric_only is not None and not numeric_only:
self._validate_dtypes(numeric_only=True)
return self._reduce_dimension(
self._query_compiler.var(
axis=axis,
skipna=skipna,
level=level,
ddof=ddof,
numeric_only=numeric_only,
**kwargs,
)
)

def __abs__(self):
"""Creates a modified DataFrame by taking the absolute value.
Expand Down
Loading

0 comments on commit 8fec8a3

Please sign in to comment.