Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

353 create imputation markers #14

Merged
merged 20 commits into from
May 22, 2024
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
9ddd6af
Change unit tests from dropping to selecting, ready for adding more c…
Jday7879 May 15, 2024
1fbbd83
Adding module to calculate imputation flag columns
Jday7879 May 16, 2024
70dfad4
Creating unit test and test data for imputation flag
Jday7879 May 16, 2024
9bd4c2a
Copying input data to fix pandas copy warnings
Jday7879 May 16, 2024
f334147
Adding docstrings
Jday7879 May 16, 2024
2bd4b04
Refactoring `matched_pair` column to include target column in name
Jday7879 May 16, 2024
122610b
Update impute flags to include impute from construction
Jday7879 May 16, 2024
f1372f0
Create function to convert impute flags into single column with strings
Jday7879 May 16, 2024
f1abca8
Fixing pandas copy on slice warning
Jday7879 May 17, 2024
77855a5
Updating docstring and handle case where needed columns are not included
Jday7879 May 17, 2024
0607562
Update error message
Jday7879 May 17, 2024
e24f451
Adding unit test for string flag column
Jday7879 May 17, 2024
052c376
Renaming imputation flag function to imputation_flag_marker
Jday7879 May 21, 2024
fc56bd0
Rename column in test data
Jday7879 May 21, 2024
1501e0e
Refactor to use dictionary to store imputation markers and conditions…
Jday7879 May 21, 2024
e8458ff
Refactor to define column names earlier in function
Jday7879 May 21, 2024
a88482f
Add f_predictive_auxiliary variable to test data
AntonZogk May 22, 2024
df6930b
refactor: Add predictive_auxiliary as function argument
AntonZogk May 22, 2024
c4a7256
Change period type to int
AntonZogk May 22, 2024
7ca00a7
Update expected columns in function and tests
AntonZogk May 22, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions src/flag_and_count_matched_pairs.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def flag_matched_pair_merge(
time_difference = -time_difference

# Creating new DF, shifting period for forward or backward
df_with_predictive_column = df[[reference, strata, target]]
df_with_predictive_column = df.copy()[[reference, strata, target]]
df_with_predictive_column["predictive_period"] = df[period] + pd.DateOffset(
months=time_difference
)
Expand All @@ -55,7 +55,7 @@ def flag_matched_pair_merge(
how="left",
)

matched_col_name = forward_or_backward + "_matched_pair"
matched_col_name = forward_or_backward + "_matched_pair_" + target

df[matched_col_name] = np.where(
df[[target, predictive_col_name]].isnull().any(axis=1), False, True
Expand Down Expand Up @@ -107,7 +107,7 @@ def flag_matched_pair_shift(
df["validate_date"] = np.where(
df[period].dt.month - df["predictive_period"].dt.month == shift, True, False
)
matched_col_name = forward_or_backward + "_matched_pair"
matched_col_name = forward_or_backward + "_matched_pair_" + target

df[matched_col_name] = np.where(
df[[target, predictive_col_name]].isnull().any(axis=1) | (~df["validate_date"]),
Expand Down
115 changes: 115 additions & 0 deletions src/imputation_flags.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
import numpy as np

from src.flag_and_count_matched_pairs import flag_matched_pair_merge


def create_impute_flags(df, target, reference, strata, auxiliary, period="period"):
    """
    Create logical columns describing which imputation types apply to a row.

    The output columns are needed to create the string flag column for
    imputation methods. The function requires the `f_predictive_<target>`
    and `b_predictive_<target>` columns produced by running
    `flag_matched_pair_merge` forwards and backwards on the target.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame containing the forward and backward predictive columns
        for the target variable.
    target : str
        column name containing target variable
    reference : str
        column name containing business reference id
    strata : str
        column name containing strata information
    auxiliary : str
        column name containing auxiliary data
    period : str, optional
        column name containing the period, passed on to
        `flag_matched_pair_merge`. Defaults to "period".

    Returns
    -------
    pd.DataFrame
        Dataframe with five additional logical columns: `r_flag` (target is
        a return), `fir_flag` (forward imputation possible), `bir_flag`
        (backward imputation possible), `c_flag` (construction possible from
        the auxiliary value) and `fic_flag` (forward imputation from
        construction possible).

    Raises
    ------
    KeyError
        If either predictive column for the target is missing from `df`.

    Notes
    -----
    The flag columns are assigned on the frame that was passed in before the
    forward merge on the auxiliary column, so the caller's frame may be
    modified; use the returned frame.
    """
    for direction in ("f", "b"):
        predictive_col = "{}_predictive_{}".format(direction, target)
        if predictive_col not in df.columns:
            raise KeyError(
                "Dataframe needs column '{}',".format(predictive_col)
                + " run flag_matched_pair function first"
            )

    # Carry the predictive value forwards / backwards within each
    # reference-strata group so consecutive missing periods are covered.
    # These are kept as local series rather than temporary columns.
    groups = df.groupby([reference, strata])
    f_target_roll = groups["f_predictive_" + target].ffill()
    b_target_roll = groups["b_predictive_" + target].bfill()

    target_missing = df[target].isna()

    df["r_flag"] = df[target].notna()
    df["fir_flag"] = f_target_roll.notna() & target_missing
    df["bir_flag"] = b_target_roll.notna() & target_missing
    df["c_flag"] = target_missing & df[auxiliary].notna()

    # Use the column names supplied by the caller; hard-coded names here
    # would only work for frames that happen to use those exact names.
    df = flag_matched_pair_merge(
        df=df,
        forward_or_backward="f",
        target=auxiliary,
        period=period,
        reference=reference,
        strata=strata,
    )

    f_auxiliary_roll = df.groupby([reference, strata])[
        "f_predictive_" + auxiliary
    ].ffill()
    # Recomputed on the merged frame, which is the frame being returned.
    df["fic_flag"] = df[target].isna() & f_auxiliary_roll.notna()

    # Remove the helper columns added by the forward merge on auxiliary.
    df.drop(
        [
            "f_predictive_" + auxiliary,
            "f_matched_pair_" + auxiliary,
        ],
        axis=1,
        inplace=True,
    )

    return df


def generate_imputation_flag_string(df):
    """
    Collapse the logical imputation flag columns into one string column.

    Flags are checked in priority order r, fir, bir, fic, c; each row gets
    the name of the first flag that is set. Rows with no flag set are
    marked "error".

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame containing the logical columns `r_flag`, `fir_flag`,
        `bir_flag`, `fic_flag` and `c_flag`.

    Returns
    -------
    pd.DataFrame
        The same dataframe with an added `imputation_flag` string column.
    """
    flags = ["r", "fir", "bir", "fic", "c"]

    imputation_flag_conditions = []
    no_earlier_flag = None
    for marker in flags:
        current = df[marker + "_flag"]
        if no_earlier_flag is None:
            # Highest priority flag has nothing to exclude.
            imputation_flag_conditions.append(current)
            no_earlier_flag = ~current
        else:
            # Only applies when every higher priority flag is False.
            imputation_flag_conditions.append(no_earlier_flag & current)
            no_earlier_flag = no_earlier_flag & ~current

    df["imputation_flag"] = np.select(
        imputation_flag_conditions, flags, default="error"
    )

    return df
28 changes: 28 additions & 0 deletions tests/imputation_flag_data.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
reference,strata,period,target_variable,auxiliary,f_predictive_target_variable,b_predictive_target_variable,r_flag,fir_flag,bir_flag,c_flag,fic_flag,imputation_flag
1,100,202001,8444,51,,,TRUE,FALSE,FALSE,FALSE,FALSE,r
1,100,202002,,51,8444,2003,FALSE,TRUE,TRUE,TRUE,TRUE,fir
1,100,202003,2003,51,,1003,TRUE,FALSE,FALSE,FALSE,FALSE,r
1,100,202004,1003,51,2003,,TRUE,FALSE,FALSE,FALSE,FALSE,r
2,100,202001,,72,,,FALSE,FALSE,TRUE,TRUE,FALSE,bir
2,100,202002,,,,,FALSE,FALSE,TRUE,FALSE,TRUE,bir
2,100,202003,,72,,3251,FALSE,FALSE,TRUE,TRUE,TRUE,bir
2,100,202004,3251,72,,,TRUE,FALSE,FALSE,FALSE,FALSE,r
3,100,202001,,7,,7511,FALSE,FALSE,TRUE,TRUE,FALSE,bir
3,100,202002,7511,7,,1234,TRUE,FALSE,FALSE,FALSE,FALSE,r
3,100,202003,1234,7,7511,1214,TRUE,FALSE,FALSE,FALSE,FALSE,r
3,100,202004,1214,7,1234,,TRUE,FALSE,FALSE,FALSE,FALSE,r
4,100,202001,64,81,,,TRUE,FALSE,FALSE,FALSE,FALSE,r
4,100,202002,,81,64,,FALSE,TRUE,TRUE,TRUE,TRUE,fir
4,100,202003,,81,,254,FALSE,TRUE,TRUE,TRUE,TRUE,fir
4,100,202004,254,81,,,TRUE,FALSE,FALSE,FALSE,FALSE,r
5,100,202001,65,81,,342,TRUE,FALSE,FALSE,FALSE,FALSE,r
5,100,202002,342,81,65,634,TRUE,FALSE,FALSE,FALSE,FALSE,r
5,100,202003,634,81,342,254,TRUE,FALSE,FALSE,FALSE,FALSE,r
5,100,202004,254,81,634,,TRUE,FALSE,FALSE,FALSE,FALSE,r
6,100,202001,64,81,,,TRUE,FALSE,FALSE,FALSE,FALSE,r
6,100,202002,,81,64,654,FALSE,TRUE,TRUE,TRUE,TRUE,fir
6,100,202003,654,81,,,TRUE,FALSE,FALSE,FALSE,FALSE,r
6,100,202004,,81,654,,FALSE,TRUE,FALSE,TRUE,TRUE,fir
7,100,202001,,40,,,FALSE,FALSE,FALSE,TRUE,FALSE,c
7,100,202002,,,,,FALSE,FALSE,FALSE,FALSE,TRUE,fic
7,100,202003,,,,,FALSE,FALSE,FALSE,FALSE,TRUE,fic
2 changes: 1 addition & 1 deletion tests/test_data_matched_pair/case1_expected_output.csv
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
reference,strata,period,target_variable,f_matched_pair,f_matched_pair_count,b_matched_pair,b_matched_pair_count
reference,strata,period,target_variable,f_matched_pair_target_variable,f_matched_pair_count,b_matched_pair_target_variable,b_matched_pair_count
1,101,202401,237,False,0,True,2
1,101,202402,281,True,2,False,1
1,101,202403,,False,1,False,0
Expand Down
2 changes: 1 addition & 1 deletion tests/test_data_matched_pair/case2_expected_output.csv
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
reference,strata,period,target_variable,f_matched_pair,f_matched_pair_count,b_matched_pair,b_matched_pair_count
reference,strata,period,target_variable,f_matched_pair_target_variable,f_matched_pair_count,b_matched_pair_target_variable,b_matched_pair_count
1,101,202401,237,False,0,True,2
1,101,202402,281,True,2,False,1
1,101,202403,,False,1,False,0
Expand Down
2 changes: 1 addition & 1 deletion tests/test_data_matched_pair/case3_expected_output.csv
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
reference,strata,period,target_variable,f_matched_pair,f_matched_pair_count,b_matched_pair,b_matched_pair_count
reference,strata,period,target_variable,f_matched_pair_target_variable,f_matched_pair_count,b_matched_pair_target_variable,b_matched_pair_count
1,101,202401,237,False,0,True,2
1,101,202402,281,True,2,False,0
1,101,202403,,False,0,False,0
Expand Down
106 changes: 76 additions & 30 deletions tests/test_flag_and_count_matched_pairs.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,11 +28,15 @@
class TestMatchedPair:
def test_flag_matched_pair_merge_forward(self, expected_output_file):
df_expected_output = load_and_format(expected_output_file)
df_expected_output.drop(
["f_matched_pair_count", "b_matched_pair", "b_matched_pair_count"],
axis=1,
inplace=True,
)
df_expected_output = df_expected_output[
[
"reference",
"strata",
"period",
"target_variable",
"f_matched_pair_target_variable",
]
]
df_input = df_expected_output[
["reference", "strata", "period", "target_variable"]
]
Expand All @@ -44,11 +48,15 @@ def test_flag_matched_pair_merge_forward(self, expected_output_file):

def test_flag_matched_pair_merge_backward(self, expected_output_file):
df_expected_output = load_and_format(expected_output_file)
df_expected_output.drop(
["f_matched_pair_count", "f_matched_pair", "b_matched_pair_count"],
axis=1,
inplace=True,
)
df_expected_output = df_expected_output[
[
"reference",
"strata",
"period",
"target_variable",
"b_matched_pair_target_variable",
]
]
df_input = df_expected_output[
["reference", "strata", "period", "target_variable"]
]
Expand All @@ -60,33 +68,67 @@ def test_flag_matched_pair_merge_backward(self, expected_output_file):

def test_count_matched_pair_forward(self, expected_output_file):
df_expected_output = load_and_format(expected_output_file)
df_expected_output.drop(
["b_matched_pair", "b_matched_pair_count"], axis=1, inplace=True
)
df_expected_output = df_expected_output[
[
"reference",
"strata",
"period",
"target_variable",
"f_matched_pair_target_variable",
"f_matched_pair_count",
]
]
df_input = df_expected_output[
["reference", "strata", "period", "target_variable", "f_matched_pair"]
[
"reference",
"strata",
"period",
"target_variable",
"f_matched_pair_target_variable",
]
]
df_output = count_matches(df_input, "f_matched_pair", "period", "strata")
df_output = count_matches(
df_input, "f_matched_pair_target_variable", "period", "strata"
)
assert_frame_equal(df_output, df_expected_output)

def test_count_matches_backward(self, expected_output_file):
df_expected_output = load_and_format(expected_output_file)
df_expected_output.drop(
["f_matched_pair", "f_matched_pair_count"], axis=1, inplace=True
)
df_expected_output = df_expected_output[
[
"reference",
"strata",
"period",
"target_variable",
"b_matched_pair_target_variable",
"b_matched_pair_count",
]
]
df_input = df_expected_output[
["reference", "strata", "period", "target_variable", "b_matched_pair"]
[
"reference",
"strata",
"period",
"target_variable",
"b_matched_pair_target_variable",
]
]
df_output = count_matches(df_input, "b_matched_pair", "period", "strata")
df_output = count_matches(
df_input, "b_matched_pair_target_variable", "period", "strata"
)
assert_frame_equal(df_output, df_expected_output)

def test_flag_matched_pair_shift_forward(self, expected_output_file):
df_expected_output = load_and_format(expected_output_file)
df_expected_output.drop(
["f_matched_pair_count", "b_matched_pair", "b_matched_pair_count"],
axis=1,
inplace=True,
)
df_expected_output = df_expected_output[
[
"reference",
"strata",
"period",
"target_variable",
"f_matched_pair_target_variable",
]
]
df_input = df_expected_output[
["reference", "strata", "period", "target_variable"]
]
Expand All @@ -98,11 +140,15 @@ def test_flag_matched_pair_shift_forward(self, expected_output_file):

def test_flag_matched_pair_shift_backward(self, expected_output_file):
df_expected_output = load_and_format(expected_output_file)
df_expected_output.drop(
["f_matched_pair_count", "f_matched_pair", "b_matched_pair_count"],
axis=1,
inplace=True,
)
df_expected_output = df_expected_output[
[
"reference",
"strata",
"period",
"target_variable",
"b_matched_pair_target_variable",
]
]
df_input = df_expected_output[
["reference", "strata", "period", "target_variable"]
]
Expand Down
45 changes: 45 additions & 0 deletions tests/test_imputation_flags.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
from pathlib import Path

import pytest
from helper_functions import load_and_format
from pandas.testing import assert_frame_equal

from src.imputation_flags import create_impute_flags, generate_imputation_flag_string


@pytest.fixture(scope="class")
def imputation_flag_test_data():
    """Load the imputation flag CSV test data once per test class."""
    csv_path = Path("tests", "imputation_flag_data.csv")
    return load_and_format(csv_path)


class TestImputationFlags:
    """Tests for the imputation flag functions using the shared CSV data."""

    def test_create_impute_flags(self, imputation_flag_test_data):
        """Logical flag columns built from the input columns match the data."""
        input_columns = [
            "reference",
            "strata",
            "period",
            "target_variable",
            "auxiliary",
            "f_predictive_target_variable",
            "b_predictive_target_variable",
        ]
        # The string flag column is produced by a different function.
        df_expected_output = imputation_flag_test_data.drop(
            columns=["imputation_flag"]
        )
        df_input = df_expected_output.copy()[input_columns]

        df_output = create_impute_flags(
            df=df_input,
            target="target_variable",
            reference="reference",
            strata="strata",
            auxiliary="auxiliary",
        )

        assert_frame_equal(df_output, df_expected_output)

    def test_imputation_flag_strings(self, imputation_flag_test_data):
        """String flag column derived from the logical flags matches the data."""
        df_expected_output = imputation_flag_test_data.copy()
        df_input = imputation_flag_test_data.drop(columns="imputation_flag")

        df_output = generate_imputation_flag_string(df_input)

        assert_frame_equal(df_output, df_expected_output)
Loading