Moved comments to implementation

modin-project · Jul 8, 2022 · 49cfa17 · 49cfa17
1 parent 5669be6
commit 49cfa17
Show file tree

Hide file tree

Showing 2 changed files with 9 additions and 7 deletions.
diff --git a/modin/core/storage_formats/pandas/query_compiler.py b/modin/core/storage_formats/pandas/query_compiler.py
@@ -1577,6 +1577,13 @@ def describe(self, **kwargs):
 
         def describe_builder(df, internal_indices=[]):
             """Apply `describe` function to the subset of columns in a single partition."""
+            # The index of the resulting dataframe is the same across all partitions
+            # when every column has the same data type. However, if some columns
+            # contain strings, the result index gains extra entries such as
+            # 'unique', 'top', and 'freq'. Since we call describe() on each partition
+            # separately, partitions that hold no object (string) columns produce a
+            # different index than those that do, leading to an index mismatch.
+            # Thus, we must reindex each partition's result with the global new_index.
             return df.iloc[:, internal_indices].describe(**kwargs).reindex(new_index)
 
         return self.__constructor__(

diff --git a/modin/pandas/test/dataframe/test_reduce.py b/modin/pandas/test/dataframe/test_reduce.py
@@ -184,13 +184,8 @@ def test_2195(datetime_is_numeric, has_numeric_column):
 # Issue: https://github.com/modin-project/modin/issues/4641
 def test_describe_column_partition_has_different_index():
     pandas_df = pandas.DataFrame(test_data["int_data"])
-    # The index of the resulting dataframe is the same amongst all partitions
-    # when dealing with only numerical data. However, if we work with columns
-    # that contain strings, we will get extra values in our result index such as
-    # 'unique', 'top', and 'freq'. Since we call describe() on each partition,
-    # we can have cases where certain partitions do not contain any of the
-    # object string data. Thus, we add an extra string column to make sure
-    # that we are setting the index correctly for all partitions.
+    # We add a string column so that column partitions hold different data types
+    # and therefore produce different 'describe' indexes (an index mismatch).
     pandas_df["string_column"] = "abc"
     modin_df = pd.DataFrame(pandas_df)
     eval_general(modin_df, pandas_df, lambda df: df.describe(include="all"))