FIX-#4641: Reindex pandas partitions in df.describe() #4651

Merged 5 commits on Jul 8, 2022

1 change: 1 addition & 0 deletions docs/release_notes/release_notes-0.16.0.rst
@@ -18,6 +18,7 @@ Key Features and Updates
* FIX-#4593: Ensure Modin warns when setting columns via attributes (#4621)
* FIX-#4584: Enable pdb debug when running cloud tests (#4585)
* FIX-#4564: Workaround import issues in Ray: auto-import pandas on python start if env var is set (#4603)
* FIX-#4641: Reindex pandas partitions in `df.describe()` (#4651)
* Performance enhancements
* PERF-#4182: Add cell-wise execution for binary ops, fix bin ops for empty dataframes (#4391)
* PERF-#4288: Improve perf of `groupby.mean` for narrow data (#4591)
2 changes: 1 addition & 1 deletion modin/core/storage_formats/pandas/query_compiler.py
@@ -1577,7 +1577,7 @@ def describe(self, **kwargs):

def describe_builder(df, internal_indices=[]):
"""Apply `describe` function to the subset of columns in a single partition."""
return df.iloc[:, internal_indices].describe(**kwargs)
return df.iloc[:, internal_indices].describe(**kwargs).reindex(new_index)
Collaborator Author:
There may be some performance implications here, but I presume it should be OK given that we're working at partition granularity?
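
For illustration only, here is a minimal standalone sketch (not Modin's actual partition machinery) of why per-column-partition describe() results can have mismatched row indices, and how reindexing each result to a shared target index (the role new_index plays in the change above) aligns them. The full_index computed below is an assumption for the sketch; the real new_index comes from the surrounding describe() implementation.

import pandas as pd

# Two column "partitions" of the same frame: one numeric, one object.
numeric_part = pd.DataFrame({"a": [1, 2, 3]})
object_part = pd.DataFrame({"b": ["x", "y", "y"]})

# describe() produces different row indices per partition:
#   numeric -> count, mean, std, min, 25%, 50%, 75%, max
#   object  -> count, unique, top, freq
num_desc = numeric_part.describe(include="all")
obj_desc = object_part.describe(include="all")

# Hypothetical stand-in for new_index: the index describe() would produce
# on the full frame.
full_index = (
    pd.concat([numeric_part, object_part], axis=1)
    .describe(include="all")
    .index
)

# Reindexing each partition's result to the shared index lets the pieces be
# concatenated column-wise without misaligned or missing rows.
aligned = pd.concat(
    [num_desc.reindex(full_index), obj_desc.reindex(full_index)], axis=1
)
print(aligned)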

Collaborator:
Adding rows with pandas reindex to dataframes with many columns can be expensive. On my Mac with 16 GB of memory, here's pandas adding an extra row to a 1 x 2^24 frame that's about 125 MB:

import pandas as pd
import numpy as np
np.random.seed(0)
df = pd.DataFrame(np.random.randint(0,100,(1,2 ** 24)))
%time df2 = df.reindex([0, 1])

The reindex takes 600 ms.

Still, the extra rows somehow have to go into the partitions that are missing them. I don't see a more efficient way to do that. I get similar performance if I do df2.loc[-1] = np.NaN to add the new row instead.

Collaborator Author:
Thanks for running the quick benchmark as a sanity check. Yeah, we need the extra rows either way, and I don't know of an easier way of doing that than with reindex either. I think the reindex time will also grow fairly quickly with the number of additional rows we add. Hopefully it won't be terribly large in the case of describe.

Collaborator:
We could potentially mitigate the performance hit by checking whether the index of df.describe equals the desired index and only reindexing if it doesn't?
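
A rough sketch of that idea, purely hypothetical, assuming the kwargs and new_index names from the enclosing describe() implementation shown in the diff above (the follow-up below measures why the extra check wasn't considered worth it):

def describe_builder(df, internal_indices=[]):
    """Apply `describe` to a subset of columns, aligning rows to the shared index."""
    result = df.iloc[:, internal_indices].describe(**kwargs)
    # Only pay for the reindex when this partition's result is missing rows.
    if not result.index.equals(new_index):
        result = result.reindex(new_index)
    return result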

Collaborator Author:
@RehanSD is there a performance penalty for calling reindex if the index is already the same?

Collaborator (@mvashishtha), Jul 8, 2022:
@RehanSD I thought about that, but in practice it looks like the time to describe outweighs the time to reindex with an equal index by a factor of over 1000:

import pandas as pd
import numpy as np
np.random.seed(0)
df = pd.DataFrame(np.random.randint(0,100,(1,2 ** 12)))
%time df2 = df.describe()
%time df2 = df2.reindex(df2.index)

I get 4.56 seconds for the describe and 282 microseconds for the reindex, so I don't think the extra optimization is worth the extra complexity.

Collaborator Author:
Yeah I just tried experimenting a bit with this case and came to the same conclusion.


return self.__constructor__(
self._modin_frame.apply_full_axis_select_indices(
15 changes: 15 additions & 0 deletions modin/pandas/test/dataframe/test_reduce.py
@@ -181,6 +181,21 @@ def test_2195(datetime_is_numeric, has_numeric_column):
)


# Issue: https://github.com/modin-project/modin/issues/4641
def test_describe_column_partition_has_different_index():
pandas_df = pandas.DataFrame(test_data["int_data"])
# The index of the resulting dataframe is the same amongst all partitions
# when dealing with only numerical data. However, if we work with columns
# that contain strings, we will get extra values in our result index such as
# 'unique', 'top', and 'freq'. Since we call describe() on each partition,
# we can have cases where certain partitions do not contain any of the
# object string data. Thus, we add an extra string column to make sure
# that we are setting the index correctly for all partitions.
Collaborator:
Shouldn't this comment be in the implementation (not in a test)?

Collaborator:
Maybe it's best for most of the comment to go in the implementation, and to keep a short note here saying that we're testing a case where different partitions have different describe rows.

Collaborator Author:
Yeah makes sense.

pandas_df["string_column"] = "abc"
modin_df = pd.DataFrame(pandas_df)
eval_general(modin_df, pandas_df, lambda df: df.describe(include="all"))


@pytest.mark.parametrize(
"exclude,include",
[