TEST-modin-project#2670: some updates to improve asv tests stability
Signed-off-by: Anatoly Myachev <[email protected]>
anmyachev committed Feb 3, 2021
1 parent e25a5e0 commit 3a1a055
Showing 2 changed files with 80 additions and 51 deletions.
76 changes: 36 additions & 40 deletions asv_bench/benchmarks/benchmarks.py
@@ -74,30 +74,32 @@ def execute(df):


 class BaseTimeGroupBy:
-    def setup(self, data_size, ncols=1):
-        self.df = generate_dataframe(
-            ASV_USE_IMPL, "int", data_size[1], data_size[0], RAND_LOW, RAND_HIGH
+    def setup(self, data_size, groupby_ncols=1):
+        count_cols = data_size[1] - groupby_ncols
+        count_groups = 100
+        self.df, self.groupby_columns = generate_dataframe(
+            ASV_USE_IMPL,
+            "int",
+            data_size[0],
+            count_cols,
+            RAND_LOW,
+            RAND_HIGH,
+            groupby_ncols,
+            count_groups,
         )
-        self.groupby_columns = self.df.columns[:ncols].tolist()


-class TimeMultiColumnGroupby(BaseTimeGroupBy):
-    param_names = ["data_size", "ncols"]
+class TimeGroupByMultiColumn(BaseTimeGroupBy):
+    param_names = ["data_size", "groupby_ncols"]
     params = [
-        [
-            (5000, 5000),
-            (10_000, 10),
-            # TODO: after optimization try to use UNARY_OP_DATA_SIZE here
-        ]
-        if ASV_DATASET_SIZE == "Big"
-        else UNARY_OP_DATA_SIZE,
+        UNARY_OP_DATA_SIZE,
         [6],
     ]

-    def time_groupby_agg_quan(self, data_size, ncols):
+    def time_groupby_agg_quan(self, data_size, groupby_ncols):
         execute(self.df.groupby(by=self.groupby_columns).agg("quantile"))

-    def time_groupby_agg_mean(self, data_size, ncols):
+    def time_groupby_agg_mean(self, data_size, groupby_ncols):
         execute(self.df.groupby(by=self.groupby_columns).apply(lambda df: df.mean()))
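
The reworked setup generates dedicated groupby columns with a fixed number of distinct groups (count_groups = 100), so the per-group workload no longer depends on which random values happen to collide. A minimal sketch of the idea, not part of the diff, with illustrative names and sizes:

import numpy as np
import pandas

nrows, count_groups = 10_000, 100
# np.tile repeats 0..99 until the column has nrows values -> exactly 100 groups
groupby_col = np.tile(np.arange(count_groups), nrows // count_groups)
df = pandas.DataFrame({"groupby_col0": groupby_col, "col0": np.arange(nrows)})
assert df["groupby_col0"].nunique() == count_groups  # group count is stable across runs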


@@ -150,44 +152,40 @@ class TimeJoin:

     def setup(self, data_size, how, sort):
         self.df1 = generate_dataframe(
-            ASV_USE_IMPL, "int", data_size[1], data_size[0], RAND_LOW, RAND_HIGH
+            ASV_USE_IMPL, "int", data_size[0], data_size[1], RAND_LOW, RAND_HIGH
         )
         self.df2 = generate_dataframe(
-            ASV_USE_IMPL, "int", data_size[3], data_size[2], RAND_LOW, RAND_HIGH
+            ASV_USE_IMPL, "int", data_size[2], data_size[3], RAND_LOW, RAND_HIGH
         )

     def time_join(self, data_size, how, sort):
-        execute(
-            self.df1.join(
-                self.df2, on=self.df1.columns[0], how=how, lsuffix="left_", sort=sort
-            )
-        )
+        # join dataframes on index to get the predictable shape
+        execute(self.df1.join(self.df2, how=how, lsuffix="left_", sort=sort))


 class TimeMerge:
     param_names = ["data_size", "how", "sort"]
     params = [
-        [
-            (5000, 5000, 5000, 5000),
-            (125_000, 15, 100_000, 10),
-            # TODO: after optimization try to use BINARY_OP_DATA_SIZE here
-        ]
-        if ASV_DATASET_SIZE == "Big"
-        else BINARY_OP_DATA_SIZE,
+        BINARY_OP_DATA_SIZE,
         ["left", "inner"],
         [False],
     ]

     def setup(self, data_size, how, sort):
         self.df1 = generate_dataframe(
-            ASV_USE_IMPL, "int", data_size[1], data_size[0], RAND_LOW, RAND_HIGH
+            ASV_USE_IMPL, "int", data_size[0], data_size[1], RAND_LOW, RAND_HIGH
         )
         self.df2 = generate_dataframe(
-            ASV_USE_IMPL, "int", data_size[3], data_size[2], RAND_LOW, RAND_HIGH
+            ASV_USE_IMPL, "int", data_size[2], data_size[3], RAND_LOW, RAND_HIGH
         )

     def time_merge(self, data_size, how, sort):
-        execute(self.df1.merge(self.df2, on=self.df1.columns[0], how=how, sort=sort))
+        # merge dataframes by index to get the predictable shape
+        execute(
+            self.df1.merge(
+                self.df2, left_index=True, right_index=True, how=how, sort=sort
+            )
+        )


 class TimeConcat:
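
Joining or merging on a column of random integers produces a result whose row count depends on how many key values happen to match, which makes run-to-run timings noisy; operating on the index keeps the output shape fixed, which is what the "predictable shape" comments refer to. A rough illustration with made-up sizes, not taken from the benchmark code:

import numpy as np
import pandas

rs = np.random.RandomState(0)
df1 = pandas.DataFrame({"key": rs.randint(0, 100, 1000), "a": rs.rand(1000)})
df2 = pandas.DataFrame({"key": rs.randint(0, 100, 1000), "b": rs.rand(1000)})

by_column = df1.merge(df2, on="key", how="inner")  # row count varies with key collisions
by_index = df1.merge(df2, left_index=True, right_index=True, how="inner")  # always 1000 rows
print(len(by_column), len(by_index))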
@@ -199,12 +197,11 @@ class TimeConcat:
     ]

     def setup(self, data_size, how, axis):
-        # shape for generate_dataframe: first - ncols, second - nrows
         self.df1 = generate_dataframe(
-            ASV_USE_IMPL, "int", data_size[1], data_size[0], RAND_LOW, RAND_HIGH
+            ASV_USE_IMPL, "int", data_size[0], data_size[1], RAND_LOW, RAND_HIGH
         )
         self.df2 = generate_dataframe(
-            ASV_USE_IMPL, "int", data_size[3], data_size[2], RAND_LOW, RAND_HIGH
+            ASV_USE_IMPL, "int", data_size[2], data_size[3], RAND_LOW, RAND_HIGH
         )

     def time_concat(self, data_size, how, axis):
@@ -225,12 +222,11 @@ class TimeBinaryOp:
     ]

     def setup(self, data_size, binary_op, axis):
-        # shape for generate_dataframe: first - ncols, second - nrows
         self.df1 = generate_dataframe(
-            ASV_USE_IMPL, "int", data_size[1], data_size[0], RAND_LOW, RAND_HIGH
+            ASV_USE_IMPL, "int", data_size[0], data_size[1], RAND_LOW, RAND_HIGH
         )
         self.df2 = generate_dataframe(
-            ASV_USE_IMPL, "int", data_size[3], data_size[2], RAND_LOW, RAND_HIGH
+            ASV_USE_IMPL, "int", data_size[2], data_size[3], RAND_LOW, RAND_HIGH
         )
         self.op = getattr(self.df1, binary_op)

@@ -260,7 +256,7 @@ def get_loc(df, loc, axis, item_length):

     def setup(self, data_size, item_length, loc, is_equal_indices):
         self.df = generate_dataframe(
-            ASV_USE_IMPL, "int", data_size[1], data_size[0], RAND_LOW, RAND_HIGH
+            ASV_USE_IMPL, "int", data_size[0], data_size[1], RAND_LOW, RAND_HIGH
         ).copy()
         self.loc, self.iloc = self.get_loc(
             self.df, loc, item_length=item_length, axis=1
@@ -315,7 +311,7 @@ class TimeArithmetic:

     def setup(self, data_size, axis):
         self.df = generate_dataframe(
-            ASV_USE_IMPL, "int", data_size[1], data_size[0], RAND_LOW, RAND_HIGH
+            ASV_USE_IMPL, "int", data_size[0], data_size[1], RAND_LOW, RAND_HIGH
         )

     def time_sum(self, data_size, axis):
55 changes: 44 additions & 11 deletions asv_bench/benchmarks/utils.py
@@ -30,8 +30,8 @@ class weakdict(dict):
 dataframes_cache = dict()


-def gen_int_data(ncols, nrows, rand_low, rand_high):
-    cache_key = ("int", ncols, nrows, rand_low, rand_high)
+def gen_int_data(nrows, ncols, rand_low, rand_high):
+    cache_key = ("int", nrows, ncols, rand_low, rand_high)
     if cache_key in data_cache:
         return data_cache[cache_key]

@@ -48,8 +48,8 @@ def gen_int_data(ncols, nrows, rand_low, rand_high):
     return data


-def gen_str_int_data(ncols, nrows, rand_low, rand_high):
-    cache_key = ("str_int", ncols, nrows, rand_low, rand_high)
+def gen_str_int_data(nrows, ncols, rand_low, rand_high):
+    cache_key = ("str_int", nrows, ncols, rand_low, rand_high)
     if cache_key in data_cache:
         return data_cache[cache_key]

@@ -58,25 +58,47 @@ def gen_str_int_data(ncols, nrows, rand_low, rand_high):
             nrows, ncols, rand_low, rand_high
         )
     )
-    data = gen_int_data(ncols, nrows, rand_low, rand_high).copy()
+    data = gen_int_data(nrows, ncols, rand_low, rand_high).copy()
     data["gb_col"] = [
         "str_{}".format(random_state.randint(rand_low, rand_high)) for i in range(nrows)
     ]
     data_cache[cache_key] = weakdict(data)
     return data


-def gen_data(data_type, ncols, nrows, rand_low, rand_high):
+def gen_data(data_type, nrows, ncols, rand_low, rand_high):
     if data_type == "int":
-        return gen_int_data(ncols, nrows, rand_low, rand_high)
+        return gen_int_data(nrows, ncols, rand_low, rand_high)
     elif data_type == "str_int":
-        return gen_str_int_data(ncols, nrows, rand_low, rand_high)
+        return gen_str_int_data(nrows, ncols, rand_low, rand_high)
     else:
         assert False


-def generate_dataframe(impl, data_type, ncols, nrows, rand_low, rand_high):
-    cache_key = (impl, data_type, ncols, nrows, rand_low, rand_high)
+def generate_dataframe(
+    impl,
+    data_type,
+    nrows,
+    ncols,
+    rand_low,
+    rand_high,
+    groupby_ncols=None,
+    count_groups=None,
+):
+    if groupby_ncols and count_groups:
+        cache_key = (
+            impl,
+            data_type,
+            nrows,
+            ncols,
+            rand_low,
+            rand_high,
+            groupby_ncols,
+            count_groups,
+        )
+    else:
+        cache_key = (impl, data_type, nrows, ncols, rand_low, rand_high)
+
     if cache_key in dataframes_cache:
         return dataframes_cache[cache_key]

@@ -85,13 +107,24 @@ def generate_dataframe(impl, data_type, ncols, nrows, rand_low, rand_high):
             impl, data_type, nrows, ncols, rand_low, rand_high
         )
     )
-    data = gen_data(data_type, ncols, nrows, rand_low, rand_high)
+    data = gen_data(data_type, nrows, ncols, rand_low, rand_high)
+
+    if groupby_ncols and count_groups:
+        groupby_columns = [f"groupby_col{x}" for x in range(groupby_ncols)]
+        for groupby_col in groupby_columns:
+            data[groupby_col] = np.tile(np.arange(count_groups), nrows // count_groups)
+
     if impl == "modin":
         df = pd.DataFrame(data)
     elif impl == "pandas":
         df = pandas.DataFrame(data)
     else:
         assert False
+
+    if groupby_ncols and count_groups:
+        dataframes_cache[cache_key] = df, groupby_columns
+        return df, groupby_columns
+
     dataframes_cache[cache_key] = df
     return df
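
With the optional groupby parameters, generate_dataframe now returns a (df, groupby_columns) pair when groupby columns are requested and a bare dataframe otherwise, and the cache key reflects that difference. A hedged usage sketch; the import path and argument values are illustrative, assuming the benchmark utilities are importable as a module:

from asv_bench.benchmarks.utils import generate_dataframe  # assumed import path

# plain call: returns only the (cached) dataframe
df = generate_dataframe("pandas", "int", 10_000, 10, 0, 100)

# groupby-aware call: also returns the names of the generated groupby columns
df, groupby_columns = generate_dataframe(
    "pandas", "int", 10_000, 10, 0, 100, groupby_ncols=1, count_groups=100
)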

