diff --git a/src/predictive_variable.py b/src/predictive_variable.py index cc17aee7..613919ca 100644 --- a/src/predictive_variable.py +++ b/src/predictive_variable.py @@ -1,14 +1,15 @@ import pandas as pd + def shift_by_strata_period( - df: pd.DataFrame, - target: str, - period: str, - strata: str, - reference: str, - time_difference: int, - new_col: str, - **kwargs + df: pd.DataFrame, + target: str, + period: str, + strata: str, + reference: str, + time_difference: int, + new_col: str, + **kwargs ) -> pd.DataFrame: """ It will perform the usual shift by desired time_difference for each value @@ -40,19 +41,15 @@ def shift_by_strata_period( Pandas dataframe of original data with a new column containing the shifted values. """ - - df.sort_values([reference,strata, period], inplace=True) - - df[new_col] = ( - df.groupby(( - ( - df[period] - pd.DateOffset(months=1) - != df.shift(1)[period] - ) - | (df[strata].diff(1) != 0) - | (df[reference].diff(1) != 0) - ) - .cumsum()) - .shift(time_difference)[target]) - - return df \ No newline at end of file + + df.sort_values([reference, strata, period], inplace=True) + + df[new_col] = df.groupby( + ( + (df[period] - pd.DateOffset(months=1) != df.shift(1)[period]) + | (df[strata].diff(1) != 0) + | (df[reference].diff(1) != 0) + ).cumsum() + ).shift(time_difference)[target] + + return df diff --git a/tests/test_predictive_variable.py b/tests/test_predictive_variable.py index 0b3672c0..83a238ef 100644 --- a/tests/test_predictive_variable.py +++ b/tests/test_predictive_variable.py @@ -1,34 +1,37 @@ -from src.predictive_variable import shift_by_strata_period from pathlib import Path + +import pytest from helper_functions import load_and_format from pandas.testing import assert_frame_equal -import pytest +from src.predictive_variable import shift_by_strata_period + @pytest.fixture(scope="class") def predictive_variable_test_data(): return load_and_format(Path("tests") / "predictive_variable.csv") + class TestPredictiveVariable: def test_predictive_variable_forward(self, predictive_variable_test_data): expected_output = predictive_variable_test_data[ - ['identifier', 'period', 'group', 'question', 'other',"f_predictive" - ]] + ["identifier", "period", "group", "question", "other", "f_predictive"] + ] input_data = expected_output.drop(columns="f_predictive") actual_output = shift_by_strata_period( - input_data, "question", "period", "group","identifier",1,"f_predictive" + input_data, "question", "period", "group", "identifier", 1, "f_predictive" ) - actual_output.sort_index(ascending=True,inplace=True) + actual_output.sort_index(ascending=True, inplace=True) assert_frame_equal(actual_output, expected_output) def test_predictive_variable_backward(self, predictive_variable_test_data): expected_output = predictive_variable_test_data[ - ['identifier', 'period', 'group', 'question', 'other',"b_predictive" - ]] + ["identifier", "period", "group", "question", "other", "b_predictive"] + ] input_data = expected_output.drop(columns="b_predictive") actual_output = shift_by_strata_period( - input_data, "question", "period", "group","identifier",-1,"b_predictive" + input_data, "question", "period", "group", "identifier", -1, "b_predictive" ) - actual_output.sort_index(ascending=True,inplace=True) - assert_frame_equal(actual_output, expected_output) \ No newline at end of file + actual_output.sort_index(ascending=True, inplace=True) + assert_frame_equal(actual_output, expected_output)