Skip to content

Commit

Permalink
FIX-#4641: Reindex pandas partitions in df.describe() (#4651)
Browse files Browse the repository at this point in the history
Signed-off-by: Karthik Velayutham <[email protected]>
  • Loading branch information
Karthik Velayutham authored Jul 8, 2022
1 parent eddfda4 commit 3578288
Show file tree
Hide file tree
Showing 3 changed files with 19 additions and 1 deletion.
1 change: 1 addition & 0 deletions docs/release_notes/release_notes-0.16.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ Key Features and Updates
* FIX-#4593: Ensure Modin warns when setting columns via attributes (#4621)
* FIX-#4584: Enable pdb debug when running cloud tests (#4585)
* FIX-#4564: Workaround import issues in Ray: auto-import pandas on python start if env var is set (#4603)
* FIX-#4641: Reindex pandas partitions in `df.describe()` (#4651)
* Performance enhancements
* PERF-#4182: Add cell-wise execution for binary ops, fix bin ops for empty dataframes (#4391)
* PERF-#4288: Improve perf of `groupby.mean` for narrow data (#4591)
Expand Down
9 changes: 8 additions & 1 deletion modin/core/storage_formats/pandas/query_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -1577,7 +1577,14 @@ def describe(self, **kwargs):

def describe_builder(df, internal_indices=[]):
    """
    Describe the selected columns of one partition and align the result rows.

    Parameters
    ----------
    df : pandas.DataFrame
        The partition to summarize.
    internal_indices : list, default: []
        Positional column indices within `df` to pass to ``describe``.

    Returns
    -------
    pandas.DataFrame
        The ``describe`` output for the selected columns, reindexed to
        ``new_index`` taken from the enclosing scope.
    """
    selected_columns = df.iloc[:, internal_indices]
    partition_summary = selected_columns.describe(**kwargs)
    # Partitions holding only one kind of data produce identical row labels,
    # but string/object columns add rows such as 'unique', 'top' and 'freq'.
    # Because describe() runs per partition, a partition without any string
    # data would lack those rows and disagree with its neighbours. Reindexing
    # every partition against the shared `new_index` keeps the rows aligned.
    return partition_summary.reindex(new_index)

return self.__constructor__(
self._modin_frame.apply_full_axis_select_indices(
Expand Down
10 changes: 10 additions & 0 deletions modin/pandas/test/dataframe/test_reduce.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,16 @@ def test_2195(datetime_is_numeric, has_numeric_column):
)


# Issue: https://github.com/modin-project/modin/issues/4641
def test_describe_column_partition_has_different_index():
    """Check `describe(include="all")` on a frame mixing int and string columns."""

    def describe_all(df):
        return df.describe(include="all")

    # Appending a string column means partitions with different data types
    # produce different 'describe' rows, which is the index mismatch that
    # this regression test guards against.
    pandas_df = pandas.DataFrame(test_data["int_data"]).assign(string_column="abc")
    modin_df = pd.DataFrame(pandas_df)
    eval_general(modin_df, pandas_df, describe_all)


@pytest.mark.parametrize(
"exclude,include",
[
Expand Down

0 comments on commit 3578288

Please sign in to comment.