Skip to content

Commit

Permalink
BENCH: add some cases for join and merge ops from pandas (#5021)
Browse files Browse the repository at this point in the history
Signed-off-by: Myachev <[email protected]>
  • Loading branch information
jbrockmendel authored Oct 10, 2022
1 parent d005429 commit abcf1e9
Show file tree
Hide file tree
Showing 4 changed files with 74 additions and 1 deletion.
63 changes: 62 additions & 1 deletion asv_bench/benchmarks/benchmarks.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
# measurements

import numpy as np
import pandas._testing as tm

from .utils import (
generate_dataframe,
Expand Down Expand Up @@ -127,12 +128,56 @@ def time_join(self, shapes, how, sort):
execute(self.df1.join(self.df2, how=how, lsuffix="left_", sort=sort))


class TimeJoinStringIndex:
    """Benchmarks joining a frame on string key columns to string-indexed frames.

    ``setup`` builds three right-hand frames of random floats (one with a
    two-level string ``MultiIndex`` and one per individual level) plus a
    left-hand frame carrying the matching ``key1``/``key2`` columns.
    """

    param_names = ["shapes", "sort"]
    params = [get_benchmark_shapes("TimeJoinStringIndex"), [True, False]]

    def setup(self, shapes, sort):
        n_rows = shapes[0]
        assert n_rows % 100 == 0, "implementation restriction"
        outer_size = 10
        inner_size = n_rows // 100

        def random_frame(index):
            # Four columns of standard-normal values, one row per index entry.
            return IMPL.DataFrame(
                np.random.randn(len(index), 4),
                index=index,
                columns=["A", "B", "C", "D"],
            )

        outer_labels = tm.makeStringIndex(outer_size).values
        inner_labels = tm.makeStringIndex(inner_size).values
        # Pair every outer label with every inner label exactly once.
        outer_codes = np.arange(outer_size).repeat(inner_size)
        inner_codes = np.tile(np.arange(inner_size), outer_size)
        multi_index = IMPL.MultiIndex(
            levels=[outer_labels, inner_labels],
            codes=[outer_codes, inner_codes],
        )
        self.df_multi = random_frame(multi_index)

        # Each key array repeats the full set of index pairs ten times.
        self.key1 = np.tile(outer_labels.take(outer_codes), 10)
        self.key2 = np.tile(inner_labels.take(inner_codes), 10)

        left = generate_dataframe("int", *shapes, RAND_LOW, RAND_HIGH)
        # Drop the last two columns before adding the two key columns,
        # so the frame keeps its source shape.
        left = left.drop(columns=left.columns[-2:])
        left["key1"] = self.key1
        left["key2"] = self.key2
        self.df = left
        execute(self.df)

        self.df_key1 = random_frame(outer_labels)
        self.df_key2 = random_frame(inner_labels)

    def time_join_dataframe_index_multi(self, shapes, sort):
        # Join on both key columns against the two-level index.
        joined = self.df.join(self.df_multi, on=["key1", "key2"], sort=sort)
        execute(joined)

    def time_join_dataframe_index_single_key_bigger(self, shapes, sort):
        # Join on ``key2`` against the frame indexed by the second level.
        joined = self.df.join(self.df_key2, on="key2", sort=sort)
        execute(joined)

    def time_join_dataframe_index_single_key_small(self, shapes, sort):
        # Join on ``key1`` against the ten-row frame indexed by the first level.
        joined = self.df.join(self.df_key1, on="key1", sort=sort)
        execute(joined)


class TimeMerge:
param_names = ["shapes", "how", "sort"]
params = [
get_benchmark_shapes("TimeMerge"),
["left", "inner"],
[False],
[True, False],
]

def setup(self, shapes, how, sort):
Expand All @@ -147,6 +192,19 @@ def time_merge(self, shapes, how, sort):
)
)

def time_merge_default(self, shapes, how, sort):
    # Merge the two prepared frames without naming any join keys.
    merged = IMPL.merge(self.df1, self.df2, how=how, sort=sort)
    execute(merged)

def time_merge_dataframe_empty_right(self, shapes, how, sort):
    # Taking a zero-row slice with `iloc` should be very fast, so it is
    # expected to add only negligible time to the measured merge.
    empty_right = self.df2.iloc[:0]
    merged = IMPL.merge(self.df1, empty_right, how=how, sort=sort)
    execute(merged)

def time_merge_dataframe_empty_left(self, shapes, how, sort):
    # Taking a zero-row slice with `iloc` should be very fast, so it is
    # expected to add only negligible time to the measured merge.
    empty_left = self.df1.iloc[:0]
    merged = IMPL.merge(empty_left, self.df2, how=how, sort=sort)
    execute(merged)


class TimeMergeCategoricals:
param_names = ["shapes", "data_type"]
Expand Down Expand Up @@ -759,3 +817,6 @@ def time_columns(self, shape):

def time_index(self, shape):
return self.df.index


from .utils import setup # noqa: E402, F401
2 changes: 2 additions & 0 deletions asv_bench/benchmarks/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
random_booleans,
translator_groupby_ngroups,
trigger_import,
setup,
)

__all__ = [
Expand All @@ -54,4 +55,5 @@
"random_booleans",
"translator_groupby_ngroups",
"trigger_import",
"setup",
]
7 changes: 7 additions & 0 deletions asv_bench/benchmarks/utils/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -594,3 +594,10 @@ def prepare_io_data_parquet(test_filename: str, data_type: str, shapes: list):
df.to_parquet(test_filenames[shape_id], index=False)

return test_filenames


def setup(*args, **kwargs):  # noqa: GL08
    # Importing this function into a benchmark file is all that is needed:
    # ASV runs it automatically before each benchmark function, so every
    # benchmark starts from the same random seed.
    # https://asv.readthedocs.io/en/latest/writing_benchmarks.html
    fixed_seed = 42
    np.random.seed(fixed_seed)
3 changes: 3 additions & 0 deletions asv_bench/benchmarks/utils/data_shapes.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,9 @@
DEFAULT_CONFIG["MergeCategoricals"] = (
[[10_000, 2]] if ASV_DATASET_SIZE == "big" else [[1_000, 2]]
)
# Shapes for the TimeJoinStringIndex benchmarks. That benchmark's setup asserts
# that the first element of each shape (presumably the row count) is a multiple
# of 100; keep that in mind when changing these values.
DEFAULT_CONFIG["TimeJoinStringIndex"] = (
[[100_000, 64]] if ASV_DATASET_SIZE == "big" else [[1_000, 4]]
)
# Each entry of the two config tables is a (shape, names) pair; register the
# shape in DEFAULT_CONFIG under every name listed for it.
for config in (_DEFAULT_CONFIG_T, _DEFAULT_HDK_CONFIG_T):
for _shape, _names in config:
DEFAULT_CONFIG.update({_name: _shape for _name in _names})
Expand Down

0 comments on commit abcf1e9

Please sign in to comment.