-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Change unit tests from dropping to selecting, ready for adding more cols into test data * Adding module to calculate imputation flag columns * Creating unit test and test data for imputation flag * Copying input data to fix pandas copy warnings * Adding docstrings * Refactoring `matched_pair` column to include target column in name * Update impute flags to include impute from construction * Create function to convert impute flags into single column with strings * Fixing pandas copy on slice warning * Updating docstring and handle case where needed columns are not included * Update error message * Adding unit test for string flag column * Renaming imputation flag function to imputation_flag_marker * Rename column in test data * Refactor to use dictionary to store imputation markers and conditions (can be extracted to yaml file if needed) * Refactor to define column names earlier in function * Add f_predictive_auxiliary variable to test data * refactor: Add predictive_auxiliary as function argument Instead of calling flag_matched_pair_merge within the function to create the predictive_auxiliary, it is defined as function argument. Hence flag_matched_pair_merge must be called before create_impute_flags. This will convert flag_matched_pair_merge to a low level function and using pandas framework. * Change period type to int * Update expected columns in function and tests --------- Co-authored-by: zogkoa <[email protected]>
- Loading branch information
Showing
8 changed files
with
297 additions
and
36 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,137 @@ | ||
import numpy as np | ||
import pandas as pd | ||
|
||
|
||
def create_impute_flags(
    df: pd.DataFrame,
    target: str,
    reference: str,
    strata: str,
    auxiliary: str,
    predictive_auxiliary: str,
) -> pd.DataFrame:
    """
    Create logical columns flagging which imputation types apply to each row.

    The output columns are needed to create the string flag column for
    imputation methods. The forward/backward predictive columns are filled
    forwards/backwards within each (reference, strata) group in the row
    order of ``df``.
    NOTE(review): this presumably expects rows to be ordered by period within
    each group - the function itself does not sort; confirm with the caller.

    ``df`` is modified in place: the five flag columns are added, the
    ``predictive_auxiliary`` column is removed, and the same object is
    returned.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame containing the columns ``f_predictive_<target>`` and
        ``b_predictive_<target>`` (created by calling flag_matched_pair_merge
        forwards and backwards) as well as the columns named by the other
        arguments.
    target : str
        Column name containing target variable.
    reference : str
        Column name containing business reference id.
    strata : str
        Column name containing strata information (sic).
    auxiliary : str
        Column name containing auxiliary data.
    predictive_auxiliary : str
        Column name containing predictive auxiliary data, this is created
        by flag_matched_pair_merge function. It is dropped from the output.

    Returns
    -------
    pd.DataFrame
        Dataframe with five additional logical columns:
        ``r_flag`` (target is present), ``fir_flag`` (target is missing and a
        forward-filled forward predictive value exists), ``bir_flag`` (target
        is missing and a backward-filled backward predictive value exists),
        ``c_flag`` (target is missing and auxiliary is present) and
        ``fic_flag`` (target is missing and a forward-filled predictive
        auxiliary value exists).

    Raises
    ------
    KeyError
        If a required column is missing from ``df``.
    """
    # Validate everything read via ``df[...]`` before touching ``df``, so a
    # missing column can never leave the caller's frame half-updated.
    for direction in ["f", "b"]:
        if "{}_predictive_{}".format(direction, target) not in df.columns:
            raise KeyError(
                "Dataframe needs column '{}_predictive_{}',".format(direction, target)
                + " run flag_matched_pair function first"
            )
    if predictive_auxiliary not in df.columns:
        raise KeyError(
            "Dataframe needs column '{}',".format(predictive_auxiliary)
            + " run flag_matched_pair function first"
        )
    for column in (target, auxiliary):
        if column not in df.columns:
            raise KeyError("Dataframe needs column '{}'".format(column))

    # ``reference`` and ``strata`` are left to groupby to resolve (and to
    # raise KeyError for), since groupby keys need not be plain columns.
    grouped = df.groupby([reference, strata])

    # Rolled series are kept as locals rather than temporary columns, so no
    # existing column of ``df`` can be overwritten and then dropped.
    forward_target_roll = grouped["f_predictive_" + target].ffill()
    backward_target_roll = grouped["b_predictive_" + target].bfill()
    forward_aux_roll = grouped[predictive_auxiliary].ffill()

    target_missing = df[target].isna()

    df["r_flag"] = df[target].notna()
    df["fir_flag"] = forward_target_roll.notna() & target_missing
    df["bir_flag"] = backward_target_roll.notna() & target_missing
    df["c_flag"] = target_missing & df[auxiliary].notna()
    df["fic_flag"] = target_missing & forward_aux_roll.notna()

    df.drop(columns=[predictive_auxiliary], inplace=True)

    return df
|
||
|
||
def generate_imputation_marker(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add a column containing a string indicating the method of imputation
    to use, following the hierarchy r > fir > bir > fic > c.

    ``df`` is modified in place (the ``imputation_marker`` column is added)
    and the same object is returned.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame containing logical columns produced by `create_impute_flags`
        (r_flag, fir_flag, bir_flag, fic_flag and c_flag)

    Returns
    -------
    pd.DataFrame
        Dataframe with additional column ``imputation_marker`` containing
        the type of imputation method that should be used to fill
        missing returns. Rows where none of the flags is set are marked
        with the string "error".

    Raises
    ------
    KeyError
        If any of the required flag columns is missing from ``df``.
    """
    flag_columns = ["r_flag", "fir_flag", "bir_flag", "fic_flag", "c_flag"]
    missing_columns = [name for name in flag_columns if name not in df.columns]
    if missing_columns:
        raise KeyError(
            "Dataframe needs columns {},".format(missing_columns)
            + " run create_impute_flags function first"
        )

    # Insertion order of this dict encodes the imputation hierarchy; each
    # condition also explicitly excludes every higher-priority flag.
    imputation_markers_and_conditions = {
        "r": df["r_flag"],
        "fir": ~df["r_flag"] & df["fir_flag"],
        "bir": ~df["r_flag"] & ~df["fir_flag"] & df["bir_flag"],
        "fic": ~df["r_flag"] & ~df["fir_flag"] & ~df["bir_flag"] & df["fic_flag"],
        "c": ~df["r_flag"]
        & ~df["fir_flag"]
        & ~df["bir_flag"]
        & ~df["fic_flag"]
        & df["c_flag"],
    }

    # np.select documents list arguments, so materialise the dict views.
    df["imputation_marker"] = np.select(
        list(imputation_markers_and_conditions.values()),
        list(imputation_markers_and_conditions.keys()),
        default="error",
    )

    return df
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
reference,strata,period,target_variable,auxiliary,f_predictive_target_variable,b_predictive_target_variable,r_flag,fir_flag,bir_flag,c_flag,fic_flag,f_predictive_auxiliary,imputation_marker | ||
1,100,202001,8444.0,51.0,,,True,False,False,False,False,,r | ||
1,100,202002,,51.0,8444.0,2003.0,False,True,True,True,True,51.0,fir | ||
1,100,202003,2003.0,51.0,,1003.0,True,False,False,False,False,51.0,r | ||
1,100,202004,1003.0,51.0,2003.0,,True,False,False,False,False,51.0,r | ||
2,100,202001,,72.0,,,False,False,True,True,False,,bir | ||
2,100,202002,,,,,False,False,True,False,True,72.0,bir | ||
2,100,202003,,72.0,,3251.0,False,False,True,True,True,,bir | ||
2,100,202004,3251.0,72.0,,,True,False,False,False,False,72.0,r | ||
3,100,202001,,7.0,,7511.0,False,False,True,True,False,,bir | ||
3,100,202002,7511.0,7.0,,1234.0,True,False,False,False,False,7.0,r | ||
3,100,202003,1234.0,7.0,7511.0,1214.0,True,False,False,False,False,7.0,r | ||
3,100,202004,1214.0,7.0,1234.0,,True,False,False,False,False,7.0,r | ||
4,100,202001,64.0,81.0,,,True,False,False,False,False,,r | ||
4,100,202002,,81.0,64.0,,False,True,True,True,True,81.0,fir | ||
4,100,202003,,81.0,,254.0,False,True,True,True,True,81.0,fir | ||
4,100,202004,254.0,81.0,,,True,False,False,False,False,81.0,r | ||
5,100,202001,65.0,81.0,,342.0,True,False,False,False,False,,r | ||
5,100,202002,342.0,81.0,65.0,634.0,True,False,False,False,False,81.0,r | ||
5,100,202003,634.0,81.0,342.0,254.0,True,False,False,False,False,81.0,r | ||
5,100,202004,254.0,81.0,634.0,,True,False,False,False,False,81.0,r | ||
6,100,202001,64.0,81.0,,,True,False,False,False,False,,r | ||
6,100,202002,,81.0,64.0,654.0,False,True,True,True,True,81.0,fir | ||
6,100,202003,654.0,81.0,,,True,False,False,False,False,81.0,r | ||
6,100,202004,,81.0,654.0,,False,True,False,True,True,81.0,fir | ||
7,100,202001,,40.0,,,False,False,False,True,False,,c | ||
7,100,202002,,,,,False,False,False,False,True,40.0,fic | ||
7,100,202003,,,,,False,False,False,False,True,,fic |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.