import pandas as pd


def shift_by_strata_period(
    df: pd.DataFrame,
    target: str,
    period: str,
    strata: str,
    reference: str,
    time_difference: int,
    new_col: str,
    **kwargs,
) -> pd.DataFrame:
    """
    Shift ``target`` within each run of consecutive monthly periods.

    Rows are ordered by ``reference``, ``strata`` and ``period``. A new run
    starts whenever the reference changes, the strata changes, or the period
    is not exactly one month after the period of the preceding row. The
    target column is then shifted by ``time_difference`` rows inside each
    run, so a value is never carried across a different reference, a
    different strata, or a gap in the periods.

    Parameters
    ----------
    df : pd.DataFrame
        Pandas dataframe of original data. It is not modified.
    target : str
        Column name containing target variable to be shifted.
    period : str
        Column name containing time period. The column must be
        datetime-like, because a one-month ``pd.DateOffset`` is subtracted
        from it to detect consecutive periods.
    strata : str
        Column name containing strata information. Any dtype that supports
        element-wise ``!=`` comparison is accepted.
    reference : str
        Column name containing business reference id. Any dtype that
        supports element-wise ``!=`` comparison is accepted.
    time_difference : int
        Number of periods to shift. Positive values pull the value from an
        earlier period, negative values from a later period.
    new_col : str
        Column name that will contain the shifted values.
    kwargs : mapping, optional
        Accepted for backward compatibility; currently ignored.

    Returns
    -------
    pd.DataFrame
        A copy of ``df`` sorted by ``reference``, ``strata`` and ``period``
        (original index labels are kept) with ``new_col`` added. Rows with
        no matching period inside their run hold NaN in ``new_col``.
    """
    # sort_values returns a new frame, so the caller's data keeps its row
    # order and does not gain the new column as a side effect.
    ordered = df.sort_values([reference, strata, period])

    # Compare each row with the one above it. Using shift() instead of
    # diff() keeps this working for non-numeric identifiers and strata.
    # The first row compares against NaN and therefore always opens a run.
    period_gap = (
        ordered[period] - pd.DateOffset(months=1) != ordered[period].shift(1)
    )
    strata_changed = ordered[strata] != ordered[strata].shift(1)
    reference_changed = ordered[reference] != ordered[reference].shift(1)

    # Each True marks the first row of a run; the running total of those
    # flags gives every row the id of the run it belongs to.
    run_id = (period_gap | strata_changed | reference_changed).cumsum()

    ordered[new_col] = ordered.groupby(run_id)[target].shift(time_difference)

    return ordered