From 06b0990301b2518e3cbe126ed75faad01e8fd0e5 Mon Sep 17 00:00:00 2001 From: Yongjie Zhao Date: Mon, 6 Sep 2021 17:56:50 +0800 Subject: [PATCH 1/5] feat: add resample operator in post processing --- superset/charts/schemas.py | 1 + superset/utils/pandas_postprocessing.py | 19 +++++++++++++++++++ .../pandas_postprocessing_tests.py | 15 +++++++++++++++ 3 files changed, 35 insertions(+) diff --git a/superset/charts/schemas.py b/superset/charts/schemas.py index 9916c0b221ec2..9a41c54896d2e 100644 --- a/superset/charts/schemas.py +++ b/superset/charts/schemas.py @@ -749,6 +749,7 @@ class ChartDataPostProcessingOperationSchema(Schema): "sort", "diff", "compare", + "resample", ) ), example="aggregate", diff --git a/superset/utils/pandas_postprocessing.py b/superset/utils/pandas_postprocessing.py index ad6c6afedade1..8f866ef10eadb 100644 --- a/superset/utils/pandas_postprocessing.py +++ b/superset/utils/pandas_postprocessing.py @@ -916,3 +916,22 @@ def outliers(series: Series) -> Set[float]: for metric in metrics } return aggregate(df, groupby=groupby, aggregates=aggregates) + + +def resample( + df: DataFrame, resample_rule: str, resample_method: str, time_column: str, +) -> DataFrame: + """ + resample a timeseries dataframe. + + :param df: DataFrame to resample. + :param resample_rule: The offset string representing target conversion. + :param resample_method: How to fill the NaN value after resample. + :param time_column: existing columns in DataFrame. + :return: DataFrame after resample + :raises QueryObjectValidationError: If the request in incorrect + """ + df.set_index(time_column, inplace=True) + df = getattr(df.resample(resample_rule), resample_method)() + df.reset_index(inplace=True) + return df diff --git a/tests/integration_tests/pandas_postprocessing_tests.py b/tests/integration_tests/pandas_postprocessing_tests.py index 1763dad336146..ef6c86dd8f51a 100644 --- a/tests/integration_tests/pandas_postprocessing_tests.py +++ b/tests/integration_tests/pandas_postprocessing_tests.py @@ -870,3 +870,18 @@ def test_boxplot_percentile_incorrect_params(self): metrics=["cars"], percentiles=[10, 90, 10], ) + + def test_resample(self): + timeseries_df.index.name = "time_column" + timeseries_df.reset_index(inplace=True) + + post_df = proc.resample( + df=timeseries_df, + resample_rule="1D", + resample_method="ffill", + time_column="time_column", + ) + self.assertListEqual( + post_df["label"].tolist(), ["x", "y", "y", "y", "z", "z", "q"] + ) + self.assertListEqual(post_df["y"].tolist(), [1.0, 2.0, 2.0, 2.0, 3.0, 3.0, 4.0]) From a2bc02dde2f675cae461a6a224623005a92ea422 Mon Sep 17 00:00:00 2001 From: Yongjie Zhao Date: Tue, 7 Sep 2021 10:54:09 +0800 Subject: [PATCH 2/5] wip --- tests/integration_tests/pandas_postprocessing_tests.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/integration_tests/pandas_postprocessing_tests.py b/tests/integration_tests/pandas_postprocessing_tests.py index ef6c86dd8f51a..7b667e4aa9ab6 100644 --- a/tests/integration_tests/pandas_postprocessing_tests.py +++ b/tests/integration_tests/pandas_postprocessing_tests.py @@ -872,11 +872,12 @@ def test_boxplot_percentile_incorrect_params(self): ) def test_resample(self): - timeseries_df.index.name = "time_column" - timeseries_df.reset_index(inplace=True) + df = timeseries_df.copy() + df.index.name = "time_column" + df.reset_index(inplace=True) post_df = proc.resample( - df=timeseries_df, + df=df, resample_rule="1D", resample_method="ffill", time_column="time_column", From 9632f67081cbdf80b0bf56b123fab3647af28d99 Mon Sep 17 00:00:00 2001 From: Yongjie Zhao Date: Tue, 7 Sep 2021 15:52:28 +0800 Subject: [PATCH 3/5] fill zero values --- superset/utils/pandas_postprocessing.py | 17 ++++++++++++----- .../pandas_postprocessing_tests.py | 10 ++++++++++ 2 files changed, 22 insertions(+), 5 deletions(-) diff --git a/superset/utils/pandas_postprocessing.py b/superset/utils/pandas_postprocessing.py index 8f866ef10eadb..c4b43a83c600d 100644 --- a/superset/utils/pandas_postprocessing.py +++ b/superset/utils/pandas_postprocessing.py @@ -919,7 +919,11 @@ def outliers(series: Series) -> Set[float]: def resample( - df: DataFrame, resample_rule: str, resample_method: str, time_column: str, + df: DataFrame, + resample_rule: str, + resample_method: str, + time_column: str, + resample_fill_zero: bool = False, ) -> DataFrame: """ resample a timeseries dataframe. @@ -928,10 +932,13 @@ def resample( :param resample_rule: The offset string representing target conversion. :param resample_method: How to fill the NaN value after resample. :param time_column: existing columns in DataFrame. + :param resample_fill_zero: fill missing values with zero. :return: DataFrame after resample :raises QueryObjectValidationError: If the request in incorrect """ - df.set_index(time_column, inplace=True) - df = getattr(df.resample(resample_rule), resample_method)() - df.reset_index(inplace=True) - return df + df = df.set_index(time_column) + if resample_method == "asfreq" and resample_fill_zero: + df = df.resample(resample_rule).asfreq(fill_value=0) + else: + df = getattr(df.resample(resample_rule), resample_method)() + return df.reset_index() diff --git a/tests/integration_tests/pandas_postprocessing_tests.py b/tests/integration_tests/pandas_postprocessing_tests.py index 7b667e4aa9ab6..a6523724e8a14 100644 --- a/tests/integration_tests/pandas_postprocessing_tests.py +++ b/tests/integration_tests/pandas_postprocessing_tests.py @@ -886,3 +886,13 @@ def test_resample(self): post_df["label"].tolist(), ["x", "y", "y", "y", "z", "z", "q"] ) self.assertListEqual(post_df["y"].tolist(), [1.0, 2.0, 2.0, 2.0, 3.0, 3.0, 4.0]) + + post_df = proc.resample( + df=df, + resample_rule="1D", + resample_method="asfreq", + time_column="time_column", + resample_fill_zero=True, + ) + self.assertListEqual(post_df["label"].tolist(), ["x", "y", 0, 0, "z", 0, "q"]) + self.assertListEqual(post_df["y"].tolist(), [1.0, 2.0, 0, 0, 3.0, 0, 4.0]) From 574b58f29c48c3e481aa4b490204a3c687f3b07d Mon Sep 17 00:00:00 2001 From: Yongjie Zhao Date: Fri, 17 Sep 2021 14:18:54 +0800 Subject: [PATCH 4/5] updates --- superset/utils/pandas_postprocessing.py | 18 +++++++++--------- .../pandas_postprocessing_tests.py | 11 ++--------- 2 files changed, 11 insertions(+), 18 deletions(-) diff --git a/superset/utils/pandas_postprocessing.py b/superset/utils/pandas_postprocessing.py index c4b43a83c600d..c95ee4fce74ff 100644 --- a/superset/utils/pandas_postprocessing.py +++ b/superset/utils/pandas_postprocessing.py @@ -920,25 +920,25 @@ def outliers(series: Series) -> Set[float]: def resample( df: DataFrame, - resample_rule: str, - resample_method: str, + rule: str, + method: str, time_column: str, - resample_fill_zero: bool = False, + fill_value: Optional[Union[float, int]], ) -> DataFrame: """ resample a timeseries dataframe. :param df: DataFrame to resample. - :param resample_rule: The offset string representing target conversion. - :param resample_method: How to fill the NaN value after resample. + :param rule: The offset string representing target conversion. + :param method: How to fill the NaN value after resample. :param time_column: existing columns in DataFrame. - :param resample_fill_zero: fill missing values with zero. + :param fill_value: What values do fill missing. :return: DataFrame after resample :raises QueryObjectValidationError: If the request in incorrect """ df = df.set_index(time_column) - if resample_method == "asfreq" and resample_fill_zero: - df = df.resample(resample_rule).asfreq(fill_value=0) + if method == "asfreq" and fill_value is not None: + df = df.resample(rule).asfreq(fill_value=fill_value) else: - df = getattr(df.resample(resample_rule), resample_method)() + df = getattr(df.resample(rule), method)() return df.reset_index() diff --git a/tests/integration_tests/pandas_postprocessing_tests.py b/tests/integration_tests/pandas_postprocessing_tests.py index a6523724e8a14..3e8fdcecda826 100644 --- a/tests/integration_tests/pandas_postprocessing_tests.py +++ b/tests/integration_tests/pandas_postprocessing_tests.py @@ -877,10 +877,7 @@ def test_resample(self): df.reset_index(inplace=True) post_df = proc.resample( - df=df, - resample_rule="1D", - resample_method="ffill", - time_column="time_column", + df=df, rule="1D", method="ffill", time_column="time_column", ) self.assertListEqual( post_df["label"].tolist(), ["x", "y", "y", "y", "z", "z", "q"] @@ -888,11 +885,7 @@ def test_resample(self): self.assertListEqual(post_df["y"].tolist(), [1.0, 2.0, 2.0, 2.0, 3.0, 3.0, 4.0]) post_df = proc.resample( - df=df, - resample_rule="1D", - resample_method="asfreq", - time_column="time_column", - resample_fill_zero=True, + df=df, rule="1D", method="asfreq", time_column="time_column", fill_value=0, ) self.assertListEqual(post_df["label"].tolist(), ["x", "y", 0, 0, "z", 0, "q"]) self.assertListEqual(post_df["y"].tolist(), [1.0, 2.0, 0, 0, 3.0, 0, 4.0]) From d89de625f150fd6b38a653288144579e74f497aa Mon Sep 17 00:00:00 2001 From: Yongjie Zhao Date: Fri, 17 Sep 2021 15:26:31 +0800 Subject: [PATCH 5/5] fix ut --- superset/utils/pandas_postprocessing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/superset/utils/pandas_postprocessing.py b/superset/utils/pandas_postprocessing.py index c95ee4fce74ff..23761216a6b02 100644 --- a/superset/utils/pandas_postprocessing.py +++ b/superset/utils/pandas_postprocessing.py @@ -923,7 +923,7 @@ def resample( rule: str, method: str, time_column: str, - fill_value: Optional[Union[float, int]], + fill_value: Optional[Union[float, int]] = None, ) -> DataFrame: """ resample a timeseries dataframe.