From f0192af1899b79f1729781263a86cecf6148cfc4 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Wed, 3 Feb 2021 20:40:54 +0300 Subject: [PATCH 1/3] TEST-#2686: add fillna benchmark Signed-off-by: Anatoly Myachev --- asv_bench/benchmarks/benchmarks.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/asv_bench/benchmarks/benchmarks.py b/asv_bench/benchmarks/benchmarks.py index 78254b98471..ce81149a5e0 100644 --- a/asv_bench/benchmarks/benchmarks.py +++ b/asv_bench/benchmarks/benchmarks.py @@ -359,3 +359,29 @@ def setup(self, shape, columns_number, ascending_list): def time_sort_values(self, shape, columns_number, ascending_list): execute(self.df.sort_values(self.columns, ascending=self.ascending)) + + +class TimeFillna: + param_names = ["shape", "limit"] + params = [UNARY_OP_DATA_SIZE[ASV_DATASET_SIZE], [None, 0.8]] + + def setup(self, shape, limit): + columns = [f"col{x}" for x in range(shape[1])] + + if ASV_USE_IMPL == "modin": + self.df = pd.DataFrame( + np.nan, index=pd.RangeIndex(shape[0]), columns=columns + ) + elif ASV_USE_IMPL == "pandas": + self.df = pandas.DataFrame( + np.nan, index=pandas.RangeIndex(shape[0]), columns=columns + ) + else: + raise NotImplementedError + + self.limit = None + if limit: + self.limit = int(limit * shape[0]) + + def time_fillna(self, shape, limit): + execute(self.df.fillna(0, limit=self.limit)) From 5b718a3e5404efae51b6fb31aae9c62d167f4fca Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Thu, 4 Feb 2021 13:00:48 +0300 Subject: [PATCH 2/3] TEST-#2686: reply to review comments Signed-off-by: Anatoly Myachev --- asv_bench/benchmarks/benchmarks.py | 32 ++++++++++-------------------- 1 file changed, 11 insertions(+), 21 deletions(-) diff --git a/asv_bench/benchmarks/benchmarks.py b/asv_bench/benchmarks/benchmarks.py index ce81149a5e0..d7bbf820601 100644 --- a/asv_bench/benchmarks/benchmarks.py +++ b/asv_bench/benchmarks/benchmarks.py @@ -49,6 +49,8 @@ ASV_USE_IMPL = os.environ.get("MODIN_ASV_USE_IMPL", "modin") ASV_DATASET_SIZE = os.environ.get("MODIN_TEST_DATASET_SIZE", "Small") +assert ASV_USE_IMPL in ("modin", "pandas") + BINARY_OP_DATA_SIZE = { "Big": [ ((5000, 5000), (5000, 5000)), @@ -82,6 +84,11 @@ "Small": 5, } +IMPL = { + "modin": pd, + "pandas": pandas, +} + def execute(df): "Make sure the calculations are done." @@ -217,12 +224,7 @@ def setup(self, shapes, how, axis): ) def time_concat(self, shapes, how, axis): - if ASV_USE_IMPL == "modin": - execute(pd.concat([self.df1, self.df2], axis=axis, join=how)) - elif ASV_USE_IMPL == "pandas": - execute(pandas.concat([self.df1, self.df2], axis=axis, join=how)) - else: - raise NotImplementedError + execute(IMPL[ASV_USE_IMPL].concat([self.df1, self.df2], axis=axis, join=how)) class TimeBinaryOp: @@ -366,22 +368,10 @@ class TimeFillna: params = [UNARY_OP_DATA_SIZE[ASV_DATASET_SIZE], [None, 0.8]] def setup(self, shape, limit): + pd = IMPL[ASV_USE_IMPL] columns = [f"col{x}" for x in range(shape[1])] - - if ASV_USE_IMPL == "modin": - self.df = pd.DataFrame( - np.nan, index=pd.RangeIndex(shape[0]), columns=columns - ) - elif ASV_USE_IMPL == "pandas": - self.df = pandas.DataFrame( - np.nan, index=pandas.RangeIndex(shape[0]), columns=columns - ) - else: - raise NotImplementedError - - self.limit = None - if limit: - self.limit = int(limit * shape[0]) + self.df = pd.DataFrame(np.nan, index=pd.RangeIndex(shape[0]), columns=columns) + self.limit = int(limit * shape[0]) if limit else None def time_fillna(self, shape, limit): execute(self.df.fillna(0, limit=self.limit)) From 8cc85bbdf71fea6298adacde765d8a9d8b0d90df Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Thu, 4 Feb 2021 15:27:16 +0300 Subject: [PATCH 3/3] TEST-#2686: add inplace parameter Signed-off-by: Anatoly Myachev --- asv_bench/benchmarks/benchmarks.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/asv_bench/benchmarks/benchmarks.py b/asv_bench/benchmarks/benchmarks.py index d7bbf820601..2f8f3cba0c2 100644 --- a/asv_bench/benchmarks/benchmarks.py +++ b/asv_bench/benchmarks/benchmarks.py @@ -364,14 +364,19 @@ def time_sort_values(self, shape, columns_number, ascending_list): class TimeFillna: - param_names = ["shape", "limit"] - params = [UNARY_OP_DATA_SIZE[ASV_DATASET_SIZE], [None, 0.8]] + param_names = ["shape", "limit", "inplace"] + params = [UNARY_OP_DATA_SIZE[ASV_DATASET_SIZE], [None, 0.8], [False, True]] - def setup(self, shape, limit): + def setup(self, shape, limit, inplace): pd = IMPL[ASV_USE_IMPL] columns = [f"col{x}" for x in range(shape[1])] self.df = pd.DataFrame(np.nan, index=pd.RangeIndex(shape[0]), columns=columns) self.limit = int(limit * shape[0]) if limit else None - def time_fillna(self, shape, limit): - execute(self.df.fillna(0, limit=self.limit)) + def time_fillna(self, shape, limit, inplace): + kw = {"value": 0.0, "limit": self.limit, "inplace": inplace} + if inplace: + self.df.fillna(**kw) + execute(self.df) + else: + execute(self.df.fillna(**kw))