From a5e1fec75d14f992b51fee836256c5859d355a02 Mon Sep 17 00:00:00 2001 From: zogkoa Date: Thu, 13 Jun 2024 15:13:43 +0100 Subject: [PATCH 1/5] Fix flags when value not supplied (missing date) * Add fill group for filling * Fill group check if there is a missing date too * Add period argument to function, to check for missing dates --- src/imputation_flags.py | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/src/imputation_flags.py b/src/imputation_flags.py index 91bc04ad..98e5e2ec 100644 --- a/src/imputation_flags.py +++ b/src/imputation_flags.py @@ -5,10 +5,12 @@ def create_impute_flags( df: pd.DataFrame, target: str, + period: str, reference: str, strata: str, auxiliary: str, predictive_auxiliary: str, + time_difference=1, ): """ @@ -24,9 +26,10 @@ def create_impute_flags( DataFrame containing forward, backward predictive period columns ( These columns are created by calling flag_matched_pair_merge forward and backwards) - target : str Column name containing target variable. + period: str + Column name containing date variable. reference : str Column name containing business reference id. strata : str @@ -36,6 +39,9 @@ def create_impute_flags( predictive_auxiliary: str Column name containing predictive auxiliary data, this is created, by flag_matched_pair_merge function. + time_difference: int + Lookup distance for matched pairs. 
+ Returns ------- @@ -56,11 +62,25 @@ def create_impute_flags( backward_target_roll = "b_predictive_" + target + "_roll" forward_aux_roll = "f_predictive_" + auxiliary + "_roll" - df[forward_target_roll] = df.groupby([reference, strata])[ + # TODO : similar conditions at cum imputation links + df["fill_group"] = ( + ( + ( + df[period] - pd.DateOffset(months=time_difference) + != df.shift(time_difference)[period] + ) + | (df[strata].diff(time_difference) != 0) + | (df[reference].diff(time_difference) != 0) + ) + .astype("int") + .cumsum() + ) + + df[forward_target_roll] = df.groupby([reference, strata, "fill_group"])[ "f_predictive_" + target ].ffill() - df[backward_target_roll] = df.groupby([reference, strata])[ + df[backward_target_roll] = df.groupby([reference, strata, "fill_group"])[ "b_predictive_" + target ].bfill() @@ -77,7 +97,9 @@ def create_impute_flags( construction_conditions = df[target].isna() & df[auxiliary].notna() df["c_flag"] = np.where(construction_conditions, True, False) - df[forward_aux_roll] = df.groupby([reference, strata])[predictive_auxiliary].ffill() + df[forward_aux_roll] = df.groupby([reference, strata, "fill_group"])[ + predictive_auxiliary + ].ffill() fic_conditions = df[target].isna() & df[forward_aux_roll].notna() df["fic_flag"] = np.where(fic_conditions, True, False) @@ -88,6 +110,7 @@ def create_impute_flags( backward_target_roll, forward_aux_roll, predictive_auxiliary, + "fill_group", ], axis=1, inplace=True, From e5e87de3cd37cc98b3c54cdd12c9ae89f73a3504 Mon Sep 17 00:00:00 2001 From: zogkoa Date: Thu, 13 Jun 2024 15:19:11 +0100 Subject: [PATCH 2/5] Add not supplied scenario to flags test data Add scenario with missing date Adjust test function to use period column --- tests/imputation_flag_data.csv | 5 +++++ tests/test_imputation_flags.py | 1 + 2 files changed, 6 insertions(+) diff --git a/tests/imputation_flag_data.csv b/tests/imputation_flag_data.csv index 31b56aa8..2553e621 100644 --- a/tests/imputation_flag_data.csv +++ 
b/tests/imputation_flag_data.csv @@ -26,3 +26,8 @@ reference,strata,period,target_variable,auxiliary,f_predictive_target_variable,b 7,100,202001,,40.0,,,False,False,False,True,False,,c 7,100,202002,,,,,False,False,False,False,True,40.0,fic 7,100,202003,,,,,False,False,False,False,True,,fic +8,100,202001,789,55,,,TRUE,FALSE,FALSE,FALSE,FALSE,,r +8,100,202002,,66,789,,FALSE,TRUE,FALSE,TRUE,TRUE,55,fir +8,100,202004,,77,,987,FALSE,FALSE,TRUE,TRUE,FALSE,,bir +8,100,202005,987,88,,,TRUE,FALSE,FALSE,FALSE,FALSE,77,r + diff --git a/tests/test_imputation_flags.py b/tests/test_imputation_flags.py index 315b5fa3..5ba671c1 100644 --- a/tests/test_imputation_flags.py +++ b/tests/test_imputation_flags.py @@ -32,6 +32,7 @@ def test_create_impute_flags(self, imputation_flag_test_data): df_output = create_impute_flags( df=df_input, target="target_variable", + period="period", reference="reference", strata="strata", auxiliary="auxiliary", From d0d4cc424ff543142cd3d74c664778768fd30299 Mon Sep 17 00:00:00 2001 From: zogkoa Date: Thu, 13 Jun 2024 15:29:27 +0100 Subject: [PATCH 3/5] Remove blank line in the end --- tests/imputation_flag_data.csv | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/imputation_flag_data.csv b/tests/imputation_flag_data.csv index 2553e621..35a22778 100644 --- a/tests/imputation_flag_data.csv +++ b/tests/imputation_flag_data.csv @@ -30,4 +30,3 @@ reference,strata,period,target_variable,auxiliary,f_predictive_target_variable,b 8,100,202002,,66,789,,FALSE,TRUE,FALSE,TRUE,TRUE,55,fir 8,100,202004,,77,,987,FALSE,FALSE,TRUE,TRUE,FALSE,,bir 8,100,202005,987,88,,,TRUE,FALSE,FALSE,FALSE,FALSE,77,r - From 1c01a32b141deff5bceeaf1964cc529a80aaab3a Mon Sep 17 00:00:00 2001 From: zogkoa Date: Wed, 19 Jun 2024 11:35:05 +0100 Subject: [PATCH 4/5] Sort values for conditions In order to check if strata, reference difference is not 0 we need to sort the values. 
--- src/imputation_flags.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/imputation_flags.py b/src/imputation_flags.py index 98e5e2ec..870ff4bd 100644 --- a/src/imputation_flags.py +++ b/src/imputation_flags.py @@ -62,6 +62,8 @@ def create_impute_flags( backward_target_roll = "b_predictive_" + target + "_roll" forward_aux_roll = "f_predictive_" + auxiliary + "_roll" + df.sort_values([reference, strata, period], inplace=True) + # TODO : similar conditions at cum imputation links df["fill_group"] = ( ( From 21110e2f73c71093cff655adcaf8e4e7b5a4f6b6 Mon Sep 17 00:00:00 2001 From: zogkoa Date: Wed, 19 Jun 2024 13:07:12 +0100 Subject: [PATCH 5/5] Set time_difference to 1, remove as int * Fill group works only with diff(1) since we are separating the values by strata and if a missing date exists. * astype(int) is redundant, when summing True/False same as 1/0 * Add kwargs and update docstrings --- src/imputation_flags.py | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/src/imputation_flags.py b/src/imputation_flags.py index 870ff4bd..984aac78 100644 --- a/src/imputation_flags.py +++ b/src/imputation_flags.py @@ -10,7 +10,7 @@ def create_impute_flags( strata: str, auxiliary: str, predictive_auxiliary: str, - time_difference=1, + **kwargs ): """ @@ -39,9 +39,8 @@ def create_impute_flags( predictive_auxiliary: str Column name containing predictive auxiliary data, this is created, by flag_matched_pair_merge function. - time_difference: int - Lookup distance for matched pairs. - + kwargs : mapping, optional + A dictionary of keyword arguments passed into func. 
Returns ------- @@ -66,17 +65,10 @@ def create_impute_flags( # TODO : similar conditions at cum imputation links df["fill_group"] = ( - ( - ( - df[period] - pd.DateOffset(months=time_difference) - != df.shift(time_difference)[period] - ) - | (df[strata].diff(time_difference) != 0) - | (df[reference].diff(time_difference) != 0) - ) - .astype("int") - .cumsum() - ) + (df[period] - pd.DateOffset(months=1) != df.shift(1)[period]) + | (df[strata].diff(1) != 0) + | (df[reference].diff(1) != 0) + ).cumsum() df[forward_target_roll] = df.groupby([reference, strata, "fill_group"])[ "f_predictive_" + target