TEST-modin-project#2670: some updates to improve asv tests stability
Signed-off-by: Anatoly Myachev <[email protected]>
anmyachev committed Feb 3, 2021
1 parent e25a5e0 commit 3a1a055
Showing 2 changed files with 80 additions and 51 deletions.
76 changes: 36 additions & 40 deletions asv_bench/benchmarks/benchmarks.py
@@ -74,30 +74,32 @@ def execute(df):


 class BaseTimeGroupBy:
-    def setup(self, data_size, ncols=1):
-        self.df = generate_dataframe(
-            ASV_USE_IMPL, "int", data_size[1], data_size[0], RAND_LOW, RAND_HIGH
+    def setup(self, data_size, groupby_ncols=1):
+        count_cols = data_size[1] - groupby_ncols
+        count_groups = 100
+        self.df, self.groupby_columns = generate_dataframe(
+            ASV_USE_IMPL,
+            "int",
+            data_size[0],
+            count_cols,
+            RAND_LOW,
+            RAND_HIGH,
+            groupby_ncols,
+            count_groups,
         )
-        self.groupby_columns = self.df.columns[:ncols].tolist()


-class TimeMultiColumnGroupby(BaseTimeGroupBy):
-    param_names = ["data_size", "ncols"]
+class TimeGroupByMultiColumn(BaseTimeGroupBy):
+    param_names = ["data_size", "groupby_ncols"]
     params = [
-        [
-            (5000, 5000),
-            (10_000, 10),
-            # TODO: after optimization try to use UNARY_OP_DATA_SIZE here
-        ]
-        if ASV_DATASET_SIZE == "Big"
-        else UNARY_OP_DATA_SIZE,
+        UNARY_OP_DATA_SIZE,
         [6],
     ]

-    def time_groupby_agg_quan(self, data_size, ncols):
+    def time_groupby_agg_quan(self, data_size, groupby_ncols):
         execute(self.df.groupby(by=self.groupby_columns).agg("quantile"))

-    def time_groupby_agg_mean(self, data_size, ncols):
+    def time_groupby_agg_mean(self, data_size, groupby_ncols):
         execute(self.df.groupby(by=self.groupby_columns).apply(lambda df: df.mean()))
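
The reworked setup generates dedicated groupby columns with a fixed number of distinct groups (count_groups = 100), so the per-group workload no longer depends on which random values happen to collide. A minimal sketch of the idea, not part of the diff, with illustrative names and sizes:

import numpy as np
import pandas

nrows, count_groups = 10_000, 100
# np.tile repeats 0..99 until the column has nrows values -> exactly 100 groups
groupby_col = np.tile(np.arange(count_groups), nrows // count_groups)
df = pandas.DataFrame({"groupby_col0": groupby_col, "col0": np.arange(nrows)})
assert df["groupby_col0"].nunique() == count_groups  # group count is stable across runs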


@@ -150,44 +152,40 @@ class TimeJoin:

     def setup(self, data_size, how, sort):
         self.df1 = generate_dataframe(
-            ASV_USE_IMPL, "int", data_size[1], data_size[0], RAND_LOW, RAND_HIGH
+            ASV_USE_IMPL, "int", data_size[0], data_size[1], RAND_LOW, RAND_HIGH
         )
         self.df2 = generate_dataframe(
-            ASV_USE_IMPL, "int", data_size[3], data_size[2], RAND_LOW, RAND_HIGH
+            ASV_USE_IMPL, "int", data_size[2], data_size[3], RAND_LOW, RAND_HIGH
         )

     def time_join(self, data_size, how, sort):
-        execute(
-            self.df1.join(
-                self.df2, on=self.df1.columns[0], how=how, lsuffix="left_", sort=sort
-            )
-        )
+        # join dataframes on index to get the predictable shape
+        execute(self.df1.join(self.df2, how=how, lsuffix="left_", sort=sort))


 class TimeMerge:
     param_names = ["data_size", "how", "sort"]
     params = [
-        [
-            (5000, 5000, 5000, 5000),
-            (125_000, 15, 100_000, 10),
-            # TODO: after optimization try to use BINARY_OP_DATA_SIZE here
-        ]
-        if ASV_DATASET_SIZE == "Big"
-        else BINARY_OP_DATA_SIZE,
+        BINARY_OP_DATA_SIZE,
         ["left", "inner"],
         [False],
     ]

     def setup(self, data_size, how, sort):
         self.df1 = generate_dataframe(
-            ASV_USE_IMPL, "int", data_size[1], data_size[0], RAND_LOW, RAND_HIGH
+            ASV_USE_IMPL, "int", data_size[0], data_size[1], RAND_LOW, RAND_HIGH
         )
         self.df2 = generate_dataframe(
-            ASV_USE_IMPL, "int", data_size[3], data_size[2], RAND_LOW, RAND_HIGH
+            ASV_USE_IMPL, "int", data_size[2], data_size[3], RAND_LOW, RAND_HIGH
         )

     def time_merge(self, data_size, how, sort):
-        execute(self.df1.merge(self.df2, on=self.df1.columns[0], how=how, sort=sort))
+        # merge dataframes by index to get the predictable shape
+        execute(
+            self.df1.merge(
+                self.df2, left_index=True, right_index=True, how=how, sort=sort
+            )
+        )


 class TimeConcat:
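
Joining or merging on a column of random integers produces a result whose row count depends on how many key values happen to match, which makes run-to-run timings noisy; operating on the index keeps the output shape fixed, which is what the "predictable shape" comments refer to. A rough illustration with made-up sizes, not taken from the benchmark code:

import numpy as np
import pandas

rs = np.random.RandomState(0)
df1 = pandas.DataFrame({"key": rs.randint(0, 100, 1000), "a": rs.rand(1000)})
df2 = pandas.DataFrame({"key": rs.randint(0, 100, 1000), "b": rs.rand(1000)})

by_column = df1.merge(df2, on="key", how="inner")  # row count varies with key collisions
by_index = df1.merge(df2, left_index=True, right_index=True, how="inner")  # always 1000 rows
print(len(by_column), len(by_index))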
@@ -199,12 +197,11 @@ class TimeConcat:
     ]

     def setup(self, data_size, how, axis):
-        # shape for generate_dataframe: first - ncols, second - nrows
         self.df1 = generate_dataframe(
-            ASV_USE_IMPL, "int", data_size[1], data_size[0], RAND_LOW, RAND_HIGH
+            ASV_USE_IMPL, "int", data_size[0], data_size[1], RAND_LOW, RAND_HIGH
         )
         self.df2 = generate_dataframe(
-            ASV_USE_IMPL, "int", data_size[3], data_size[2], RAND_LOW, RAND_HIGH
+            ASV_USE_IMPL, "int", data_size[2], data_size[3], RAND_LOW, RAND_HIGH
         )

     def time_concat(self, data_size, how, axis):
@@ -225,12 +222,11 @@ class TimeBinaryOp:
     ]

     def setup(self, data_size, binary_op, axis):
-        # shape for generate_dataframe: first - ncols, second - nrows
         self.df1 = generate_dataframe(
-            ASV_USE_IMPL, "int", data_size[1], data_size[0], RAND_LOW, RAND_HIGH
+            ASV_USE_IMPL, "int", data_size[0], data_size[1], RAND_LOW, RAND_HIGH
         )
         self.df2 = generate_dataframe(
-            ASV_USE_IMPL, "int", data_size[3], data_size[2], RAND_LOW, RAND_HIGH
+            ASV_USE_IMPL, "int", data_size[2], data_size[3], RAND_LOW, RAND_HIGH
         )
         self.op = getattr(self.df1, binary_op)

@@ -260,7 +256,7 @@ def get_loc(df, loc, axis, item_length):

     def setup(self, data_size, item_length, loc, is_equal_indices):
         self.df = generate_dataframe(
-            ASV_USE_IMPL, "int", data_size[1], data_size[0], RAND_LOW, RAND_HIGH
+            ASV_USE_IMPL, "int", data_size[0], data_size[1], RAND_LOW, RAND_HIGH
         ).copy()
         self.loc, self.iloc = self.get_loc(
             self.df, loc, item_length=item_length, axis=1
@@ -315,7 +311,7 @@ class TimeArithmetic:

     def setup(self, data_size, axis):
         self.df = generate_dataframe(
-            ASV_USE_IMPL, "int", data_size[1], data_size[0], RAND_LOW, RAND_HIGH
+            ASV_USE_IMPL, "int", data_size[0], data_size[1], RAND_LOW, RAND_HIGH
         )

     def time_sum(self, data_size, axis):
55 changes: 44 additions & 11 deletions asv_bench/benchmarks/utils.py
@@ -30,8 +30,8 @@ class weakdict(dict):
 dataframes_cache = dict()


-def gen_int_data(ncols, nrows, rand_low, rand_high):
-    cache_key = ("int", ncols, nrows, rand_low, rand_high)
+def gen_int_data(nrows, ncols, rand_low, rand_high):
+    cache_key = ("int", nrows, ncols, rand_low, rand_high)
     if cache_key in data_cache:
         return data_cache[cache_key]

@@ -48,8 +48,8 @@ def gen_int_data(ncols, nrows, rand_low, rand_high):
     return data


-def gen_str_int_data(ncols, nrows, rand_low, rand_high):
-    cache_key = ("str_int", ncols, nrows, rand_low, rand_high)
+def gen_str_int_data(nrows, ncols, rand_low, rand_high):
+    cache_key = ("str_int", nrows, ncols, rand_low, rand_high)
     if cache_key in data_cache:
         return data_cache[cache_key]

@@ -58,25 +58,47 @@ def gen_str_int_data(ncols, nrows, rand_low, rand_high):
             nrows, ncols, rand_low, rand_high
         )
     )
-    data = gen_int_data(ncols, nrows, rand_low, rand_high).copy()
+    data = gen_int_data(nrows, ncols, rand_low, rand_high).copy()
     data["gb_col"] = [
         "str_{}".format(random_state.randint(rand_low, rand_high)) for i in range(nrows)
     ]
     data_cache[cache_key] = weakdict(data)
     return data


-def gen_data(data_type, ncols, nrows, rand_low, rand_high):
+def gen_data(data_type, nrows, ncols, rand_low, rand_high):
     if data_type == "int":
-        return gen_int_data(ncols, nrows, rand_low, rand_high)
+        return gen_int_data(nrows, ncols, rand_low, rand_high)
     elif data_type == "str_int":
-        return gen_str_int_data(ncols, nrows, rand_low, rand_high)
+        return gen_str_int_data(nrows, ncols, rand_low, rand_high)
     else:
         assert False


-def generate_dataframe(impl, data_type, ncols, nrows, rand_low, rand_high):
-    cache_key = (impl, data_type, ncols, nrows, rand_low, rand_high)
+def generate_dataframe(
+    impl,
+    data_type,
+    nrows,
+    ncols,
+    rand_low,
+    rand_high,
+    groupby_ncols=None,
+    count_groups=None,
+):
+    if groupby_ncols and count_groups:
+        cache_key = (
+            impl,
+            data_type,
+            nrows,
+            ncols,
+            rand_low,
+            rand_high,
+            groupby_ncols,
+            count_groups,
+        )
+    else:
+        cache_key = (impl, data_type, nrows, ncols, rand_low, rand_high)
+
     if cache_key in dataframes_cache:
         return dataframes_cache[cache_key]

@@ -85,13 +107,24 @@ def generate_dataframe(impl, data_type, ncols, nrows, rand_low, rand_high):
             impl, data_type, nrows, ncols, rand_low, rand_high
         )
     )
-    data = gen_data(data_type, ncols, nrows, rand_low, rand_high)
+    data = gen_data(data_type, nrows, ncols, rand_low, rand_high)
+
+    if groupby_ncols and count_groups:
+        groupby_columns = [f"groupby_col{x}" for x in range(groupby_ncols)]
+        for groupby_col in groupby_columns:
+            data[groupby_col] = np.tile(np.arange(count_groups), nrows // count_groups)
+
     if impl == "modin":
         df = pd.DataFrame(data)
     elif impl == "pandas":
         df = pandas.DataFrame(data)
     else:
         assert False
+
+    if groupby_ncols and count_groups:
+        dataframes_cache[cache_key] = df, groupby_columns
+        return df, groupby_columns
+
     dataframes_cache[cache_key] = df
     return df
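
With the optional groupby parameters, generate_dataframe now returns a (df, groupby_columns) pair when groupby columns are requested and a bare dataframe otherwise, and the cache key reflects that difference. A hedged usage sketch; the import path and argument values are illustrative, assuming the benchmark utilities are importable as a module:

from asv_bench.benchmarks.utils import generate_dataframe  # assumed import path

# plain call: returns only the (cached) dataframe
df = generate_dataframe("pandas", "int", 10_000, 10, 0, 100)

# groupby-aware call: also returns the names of the generated groupby columns
df, groupby_columns = generate_dataframe(
    "pandas", "int", 10_000, 10, 0, 100, groupby_ncols=1, count_groups=100
)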

