Skip to content

Commit

Permalink
353 create imputation markers (#14)
Browse files Browse the repository at this point in the history
* Change unit tests from dropping to selecting, ready for adding more cols into test data

* Adding module to calculate imputation flag columns

* Creating unit test and test data for imputation flag

* Copying input data to fix pandas copy warnings

* Adding docstrings

* Refactoring `matched_pair` column to include target column in name

* Update impute flags to include impute from construction

* Create function to convert impute flags into single column with strings

* Fixing pandas copy on slice warning

* Updating docstring and handle case where needed columns are not included

* Update error message

* Adding unit test for string flag column

* Renaming imputation flag function to imputation_flag_marker

* Rename column in test data

* Refactor to use dictionary to store imputation markers and conditions (can be extracted to yaml file if needed)

* Refactor to define column names earlier in function

* Add f_predictive_auxiliary variable to test data

* refactor: Add predictive_auxiliary as function argument

Instead of calling flag_matched_pair_merge within the function to create the predictive_auxiliary column, the name of that column is now passed in as a function argument. Hence flag_matched_pair_merge must be called before create_impute_flags. This turns flag_matched_pair_merge into a low-level function that uses the pandas framework.

* Change period type to int

* Update expected columns in function and tests

---------

Co-authored-by: zogkoa <[email protected]>
  • Loading branch information
Jday7879 and AntonZogk authored May 22, 2024
1 parent 0247a01 commit a27bb91
Show file tree
Hide file tree
Showing 8 changed files with 297 additions and 36 deletions.
6 changes: 3 additions & 3 deletions src/flag_and_count_matched_pairs.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def flag_matched_pair_merge(
time_difference = -time_difference

# Creating new DF, shifting period for forward or backward
df_with_predictive_column = df[[reference, strata, target]]
df_with_predictive_column = df.copy()[[reference, strata, target]]
df_with_predictive_column["predictive_period"] = df[period] + pd.DateOffset(
months=time_difference
)
Expand All @@ -55,7 +55,7 @@ def flag_matched_pair_merge(
how="left",
)

matched_col_name = forward_or_backward + "_matched_pair"
matched_col_name = forward_or_backward + "_matched_pair_" + target

df[matched_col_name] = np.where(
df[[target, predictive_col_name]].isnull().any(axis=1), False, True
Expand Down Expand Up @@ -107,7 +107,7 @@ def flag_matched_pair_shift(
df["validate_date"] = np.where(
df[period].dt.month - df["predictive_period"].dt.month == shift, True, False
)
matched_col_name = forward_or_backward + "_matched_pair"
matched_col_name = forward_or_backward + "_matched_pair_" + target

df[matched_col_name] = np.where(
df[[target, predictive_col_name]].isnull().any(axis=1) | (~df["validate_date"]),
Expand Down
137 changes: 137 additions & 0 deletions src/imputation_flags.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
import numpy as np
import pandas as pd


def create_impute_flags(
    df: pd.DataFrame,
    target: str,
    reference: str,
    strata: str,
    auxiliary: str,
    predictive_auxiliary: str,
) -> pd.DataFrame:
    """
    Create logical columns for each type of imputation.

    The output columns are needed to create the string flag column for
    imputation methods. The function requires the forward and backward
    predictive target columns (``f_predictive_<target>`` and
    ``b_predictive_<target>``) to be present in ``df``.

    All required columns are checked before anything is written, so a
    failed call leaves ``df`` untouched.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame containing forward, backward predictive period columns (
        These columns are created by calling flag_matched_pair_merge forward
        and backwards). The frame is modified in place.
    target : str
        Column name containing target variable.
    reference : str
        Column name containing business reference id.
    strata : str
        Column name containing strata information (sic).
    auxiliary : str
        Column name containing auxiliary data.
    predictive_auxiliary : str
        Column name containing predictive auxiliary data, this is created
        by flag_matched_pair_merge function. This column is removed from
        ``df`` before returning.

    Returns
    -------
    pd.DataFrame
        The same dataframe with five additional logical columns determining
        if target is a return (r_flag), can be imputed by forward imputation
        (fir_flag), backward imputation (bir_flag), forward imputation from
        construction (fic_flag) or can be constructed (c_flag).

    Raises
    ------
    KeyError
        If any of the required columns is missing from ``df``.
    """
    forward_target = "f_predictive_" + target
    backward_target = "b_predictive_" + target

    # Validate everything up front: failing half-way through would leave the
    # caller's dataframe with a partial set of new columns.
    for column in (forward_target, backward_target):
        if column not in df.columns:
            raise KeyError(
                "Dataframe needs column '{}',".format(column)
                + " run flag_matched_pair function first"
            )

    required = (target, reference, strata, auxiliary, predictive_auxiliary)
    missing = [column for column in required if column not in df.columns]
    if missing:
        raise KeyError(
            "Dataframe is missing required column(s): {}".format(missing)
        )

    # Roll the predictive values along each (reference, strata) group so that
    # runs of consecutive missing periods are still flagged as imputable.
    # These are kept as local Series rather than temporary columns in df.
    grouped = df.groupby([reference, strata])
    forward_target_roll = grouped[forward_target].ffill()
    backward_target_roll = grouped[backward_target].bfill()
    forward_aux_roll = grouped[predictive_auxiliary].ffill()

    target_missing = df[target].isna()

    # Boolean masks are assigned directly; np.where(mask, True, False) would
    # only reproduce the mask.
    df["r_flag"] = df[target].notna()
    df["fir_flag"] = forward_target_roll.notna() & target_missing
    df["bir_flag"] = backward_target_roll.notna() & target_missing
    df["c_flag"] = target_missing & df[auxiliary].notna()
    df["fic_flag"] = target_missing & forward_aux_roll.notna()

    # The predictive auxiliary column is only an input to the flags and is
    # not part of the returned frame.
    df.drop(columns=[predictive_auxiliary], inplace=True)

    return df


def generate_imputation_marker(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add a string column naming the imputation method to use for each row,
    following the hierarchy r > fir > bir > fic > c.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame containing the logical columns produced by
        `create_impute_flags` (r_flag, fir_flag, bir_flag, fic_flag and
        c_flag).

    Returns
    -------
    pd.DataFrame
        The same dataframe with an additional `imputation_marker` column
        holding the type of imputation method that should be used to fill
        missing returns; rows matching no flag are marked "error".
    """
    # Each mask below means "nothing higher in the hierarchy applied so far".
    not_returned = ~df["r_flag"]
    not_forward = not_returned & ~df["fir_flag"]
    not_backward = not_forward & ~df["bir_flag"]
    not_forward_construct = not_backward & ~df["fic_flag"]

    markers = ["r", "fir", "bir", "fic", "c"]
    conditions = [
        df["r_flag"],
        not_returned & df["fir_flag"],
        not_forward & df["bir_flag"],
        not_backward & df["fic_flag"],
        not_forward_construct & df["c_flag"],
    ]

    df["imputation_marker"] = np.select(conditions, markers, default="error")

    return df
28 changes: 28 additions & 0 deletions tests/imputation_flag_data.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
reference,strata,period,target_variable,auxiliary,f_predictive_target_variable,b_predictive_target_variable,r_flag,fir_flag,bir_flag,c_flag,fic_flag,f_predictive_auxiliary,imputation_marker
1,100,202001,8444.0,51.0,,,True,False,False,False,False,,r
1,100,202002,,51.0,8444.0,2003.0,False,True,True,True,True,51.0,fir
1,100,202003,2003.0,51.0,,1003.0,True,False,False,False,False,51.0,r
1,100,202004,1003.0,51.0,2003.0,,True,False,False,False,False,51.0,r
2,100,202001,,72.0,,,False,False,True,True,False,,bir
2,100,202002,,,,,False,False,True,False,True,72.0,bir
2,100,202003,,72.0,,3251.0,False,False,True,True,True,,bir
2,100,202004,3251.0,72.0,,,True,False,False,False,False,72.0,r
3,100,202001,,7.0,,7511.0,False,False,True,True,False,,bir
3,100,202002,7511.0,7.0,,1234.0,True,False,False,False,False,7.0,r
3,100,202003,1234.0,7.0,7511.0,1214.0,True,False,False,False,False,7.0,r
3,100,202004,1214.0,7.0,1234.0,,True,False,False,False,False,7.0,r
4,100,202001,64.0,81.0,,,True,False,False,False,False,,r
4,100,202002,,81.0,64.0,,False,True,True,True,True,81.0,fir
4,100,202003,,81.0,,254.0,False,True,True,True,True,81.0,fir
4,100,202004,254.0,81.0,,,True,False,False,False,False,81.0,r
5,100,202001,65.0,81.0,,342.0,True,False,False,False,False,,r
5,100,202002,342.0,81.0,65.0,634.0,True,False,False,False,False,81.0,r
5,100,202003,634.0,81.0,342.0,254.0,True,False,False,False,False,81.0,r
5,100,202004,254.0,81.0,634.0,,True,False,False,False,False,81.0,r
6,100,202001,64.0,81.0,,,True,False,False,False,False,,r
6,100,202002,,81.0,64.0,654.0,False,True,True,True,True,81.0,fir
6,100,202003,654.0,81.0,,,True,False,False,False,False,81.0,r
6,100,202004,,81.0,654.0,,False,True,False,True,True,81.0,fir
7,100,202001,,40.0,,,False,False,False,True,False,,c
7,100,202002,,,,,False,False,False,False,True,40.0,fic
7,100,202003,,,,,False,False,False,False,True,,fic
2 changes: 1 addition & 1 deletion tests/test_data_matched_pair/case1_expected_output.csv
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
reference,strata,period,target_variable,f_matched_pair,f_matched_pair_count,b_matched_pair,b_matched_pair_count
reference,strata,period,target_variable,f_matched_pair_target_variable,f_matched_pair_count,b_matched_pair_target_variable,b_matched_pair_count
1,101,202401,237,False,0,True,2
1,101,202402,281,True,2,False,1
1,101,202403,,False,1,False,0
Expand Down
2 changes: 1 addition & 1 deletion tests/test_data_matched_pair/case2_expected_output.csv
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
reference,strata,period,target_variable,f_matched_pair,f_matched_pair_count,b_matched_pair,b_matched_pair_count
reference,strata,period,target_variable,f_matched_pair_target_variable,f_matched_pair_count,b_matched_pair_target_variable,b_matched_pair_count
1,101,202401,237,False,0,True,2
1,101,202402,281,True,2,False,1
1,101,202403,,False,1,False,0
Expand Down
2 changes: 1 addition & 1 deletion tests/test_data_matched_pair/case3_expected_output.csv
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
reference,strata,period,target_variable,f_matched_pair,f_matched_pair_count,b_matched_pair,b_matched_pair_count
reference,strata,period,target_variable,f_matched_pair_target_variable,f_matched_pair_count,b_matched_pair_target_variable,b_matched_pair_count
1,101,202401,237,False,0,True,2
1,101,202402,281,True,2,False,0
1,101,202403,,False,0,False,0
Expand Down
106 changes: 76 additions & 30 deletions tests/test_flag_and_count_matched_pairs.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,11 +28,15 @@
class TestMatchedPair:
def test_flag_matched_pair_merge_forward(self, expected_output_file):
df_expected_output = load_and_format(expected_output_file)
df_expected_output.drop(
["f_matched_pair_count", "b_matched_pair", "b_matched_pair_count"],
axis=1,
inplace=True,
)
df_expected_output = df_expected_output[
[
"reference",
"strata",
"period",
"target_variable",
"f_matched_pair_target_variable",
]
]
df_input = df_expected_output[
["reference", "strata", "period", "target_variable"]
]
Expand All @@ -44,11 +48,15 @@ def test_flag_matched_pair_merge_forward(self, expected_output_file):

def test_flag_matched_pair_merge_backward(self, expected_output_file):
df_expected_output = load_and_format(expected_output_file)
df_expected_output.drop(
["f_matched_pair_count", "f_matched_pair", "b_matched_pair_count"],
axis=1,
inplace=True,
)
df_expected_output = df_expected_output[
[
"reference",
"strata",
"period",
"target_variable",
"b_matched_pair_target_variable",
]
]
df_input = df_expected_output[
["reference", "strata", "period", "target_variable"]
]
Expand All @@ -60,33 +68,67 @@ def test_flag_matched_pair_merge_backward(self, expected_output_file):

def test_count_matched_pair_forward(self, expected_output_file):
df_expected_output = load_and_format(expected_output_file)
df_expected_output.drop(
["b_matched_pair", "b_matched_pair_count"], axis=1, inplace=True
)
df_expected_output = df_expected_output[
[
"reference",
"strata",
"period",
"target_variable",
"f_matched_pair_target_variable",
"f_matched_pair_count",
]
]
df_input = df_expected_output[
["reference", "strata", "period", "target_variable", "f_matched_pair"]
[
"reference",
"strata",
"period",
"target_variable",
"f_matched_pair_target_variable",
]
]
df_output = count_matches(df_input, "f_matched_pair", "period", "strata")
df_output = count_matches(
df_input, "f_matched_pair_target_variable", "period", "strata"
)
assert_frame_equal(df_output, df_expected_output)

def test_count_matches_backward(self, expected_output_file):
df_expected_output = load_and_format(expected_output_file)
df_expected_output.drop(
["f_matched_pair", "f_matched_pair_count"], axis=1, inplace=True
)
df_expected_output = df_expected_output[
[
"reference",
"strata",
"period",
"target_variable",
"b_matched_pair_target_variable",
"b_matched_pair_count",
]
]
df_input = df_expected_output[
["reference", "strata", "period", "target_variable", "b_matched_pair"]
[
"reference",
"strata",
"period",
"target_variable",
"b_matched_pair_target_variable",
]
]
df_output = count_matches(df_input, "b_matched_pair", "period", "strata")
df_output = count_matches(
df_input, "b_matched_pair_target_variable", "period", "strata"
)
assert_frame_equal(df_output, df_expected_output)

def test_flag_matched_pair_shift_forward(self, expected_output_file):
df_expected_output = load_and_format(expected_output_file)
df_expected_output.drop(
["f_matched_pair_count", "b_matched_pair", "b_matched_pair_count"],
axis=1,
inplace=True,
)
df_expected_output = df_expected_output[
[
"reference",
"strata",
"period",
"target_variable",
"f_matched_pair_target_variable",
]
]
df_input = df_expected_output[
["reference", "strata", "period", "target_variable"]
]
Expand All @@ -98,11 +140,15 @@ def test_flag_matched_pair_shift_forward(self, expected_output_file):

def test_flag_matched_pair_shift_backward(self, expected_output_file):
df_expected_output = load_and_format(expected_output_file)
df_expected_output.drop(
["f_matched_pair_count", "f_matched_pair", "b_matched_pair_count"],
axis=1,
inplace=True,
)
df_expected_output = df_expected_output[
[
"reference",
"strata",
"period",
"target_variable",
"b_matched_pair_target_variable",
]
]
df_input = df_expected_output[
["reference", "strata", "period", "target_variable"]
]
Expand Down
Loading

0 comments on commit a27bb91

Please sign in to comment.