353 create imputation markers (#14)

* Change unit tests from dropping to selecting, ready for adding more cols into test data
* Adding module to calculate imputation flag columns
* Creating unit test and test data for imputation flag
* Copying input data to fix pandas copy warnings
* Adding docstrings
* Refactoring `matched_pair` column to include target column in name
* Update impute flags to include impute from construction
* Create function to convert impute flags into single column with strings
* Fixing pandas copy on slice warning
* Updating docstring and handle case where needed columns are not included
* Update error message
* Adding unit test for string flag column
* Renaming imputation flag function to imputation_flag_marker
* Rename column in test data
* Refactor to use dictionary to store imputation markers and conditions (can be extracted to yaml file if needed)
* Refactor to define column names earlier in function
* Add f_predictive_auxiliary variable to test data
* refactor: Add predictive_auxiliary as function argument
  Instead of calling flag_matched_pair_merge within the function to create the predictive_auxiliary, it is defined as function argument. Hence flag_matched_pair_merge must be called before create_impute_flags. This will convert flag_matched_pair_merge to a low level function and using pandas framework.
* Change period type to int
* Update expected columns in function and tests
---------
Co-authored-by: zogkoa <[email protected]>
ONSdigital · May 22, 2024 · a27bb91 · a27bb91
1 parent 0247a01
commit a27bb91
Show file tree

Hide file tree

Showing 8 changed files with 297 additions and 36 deletions.
diff --git a/src/flag_and_count_matched_pairs.py b/src/flag_and_count_matched_pairs.py
@@ -39,7 +39,7 @@ def flag_matched_pair_merge(
         time_difference = -time_difference
 
     # Creating new DF, shifting period for forward or backward
-    df_with_predictive_column = df[[reference, strata, target]]
+    df_with_predictive_column = df.copy()[[reference, strata, target]]
     df_with_predictive_column["predictive_period"] = df[period] + pd.DateOffset(
         months=time_difference
     )
@@ -55,7 +55,7 @@ def flag_matched_pair_merge(
         how="left",
     )
 
-    matched_col_name = forward_or_backward + "_matched_pair"
+    matched_col_name = forward_or_backward + "_matched_pair_" + target
 
     df[matched_col_name] = np.where(
         df[[target, predictive_col_name]].isnull().any(axis=1), False, True
@@ -107,7 +107,7 @@ def flag_matched_pair_shift(
     df["validate_date"] = np.where(
         df[period].dt.month - df["predictive_period"].dt.month == shift, True, False
     )
-    matched_col_name = forward_or_backward + "_matched_pair"
+    matched_col_name = forward_or_backward + "_matched_pair_" + target
 
     df[matched_col_name] = np.where(
         df[[target, predictive_col_name]].isnull().any(axis=1) | (~df["validate_date"]),

diff --git a/src/imputation_flags.py b/src/imputation_flags.py
@@ -0,0 +1,137 @@
+import numpy as np
+import pandas as pd
+
+
+def create_impute_flags(
+    df: pd.DataFrame,
+    target: str,
+    reference: str,
+    strata: str,
+    auxiliary: str,
+    predictive_auxiliary: str,
+):
+
+    """
+    Create logical (boolean) flag columns, one for each type of imputation.
+    The output columns are needed to create the string flag column for
+    imputation methods.
+    The function requires the `f_predictive_<target>` and
+    `b_predictive_<target>` columns produced by `flag_matched_pair_merge`.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        DataFrame containing forward, backward predictive period columns (
+        These columns are created by calling flag_matched_pair_merge forward
+        and backwards)
+
+    target : str
+        Column name containing target variable.
+    reference : str
+        Column name containing business reference id.
+    strata : str
+        Column name containing strata information (sic).
+    auxiliary : str
+        Column name containing auxiliary data.
+    predictive_auxiliary: str
+        Column name containing predictive auxiliary data, this is created,
+        by flag_matched_pair_merge function.
+
+    Returns
+    -------
+    pd.DataFrame
+        Dataframe with five additional logical columns: target is a return
+        (r_flag), can be forward (fir_flag) or backward (bir_flag) imputed, can
+        be constructed (c_flag) or has forward-filled auxiliary data (fic_flag)
+    """
+    for direction in ["f", "b"]:
+        try:
+            df["{}_predictive_{}".format(direction, target)]
+        except KeyError:
+            raise KeyError(
+                "Dataframe needs column '{}_predictive_{}',".format(direction, target)
+                + " run flag_matched_pair function first"
+            )
+    forward_target_roll = "f_predictive_" + target + "_roll"
+    backward_target_roll = "b_predictive_" + target + "_roll"
+    forward_aux_roll = "f_predictive_" + auxiliary + "_roll"
+
+    df[forward_target_roll] = df.groupby([reference, strata])[
+        "f_predictive_" + target
+    ].ffill()
+
+    df[backward_target_roll] = df.groupby([reference, strata])[
+        "b_predictive_" + target
+    ].bfill()
+
+    df["r_flag"] = df[target].notna()
+
+    df["fir_flag"] = np.where(
+        df[forward_target_roll].notna() & df[target].isna(), True, False
+    )
+
+    df["bir_flag"] = np.where(
+        df[backward_target_roll].notna() & df[target].isna(), True, False
+    )
+
+    construction_conditions = df[target].isna() & df[auxiliary].notna()
+    df["c_flag"] = np.where(construction_conditions, True, False)
+
+    df[forward_aux_roll] = df.groupby([reference, strata])[predictive_auxiliary].ffill()
+
+    fic_conditions = df[target].isna() & df[forward_aux_roll].notna()
+    df["fic_flag"] = np.where(fic_conditions, True, False)
+
+    df.drop(
+        [
+            forward_target_roll,
+            backward_target_roll,
+            forward_aux_roll,
+            predictive_auxiliary,
+        ],
+        axis=1,
+        inplace=True,
+    )
+
+    return df
+
+
+def generate_imputation_marker(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Function to add a column containing a string indicating the method of
+    imputation to use, following the hierarchy in the specifications.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        DataFrame containing logical columns produced by `create_impute_flags`
+        (r_flag, fir_flag, bir_flag, fic_flag and c_flag)
+
+
+    Returns
+    -------
+    pd.DataFrame
+        Dataframe with additional column containing imputation marker
+        i.e. the type of imputation method that should be used to fill
+        missing returns.
+    """
+
+    imputation_markers_and_conditions = {
+        "r": df["r_flag"],
+        "fir": ~df["r_flag"] & df["fir_flag"],
+        "bir": ~df["r_flag"] & ~df["fir_flag"] & df["bir_flag"],
+        "fic": ~df["r_flag"] & ~df["fir_flag"] & ~df["bir_flag"] & df["fic_flag"],
+        "c": ~df["r_flag"]
+        & ~df["fir_flag"]
+        & ~df["bir_flag"]
+        & ~df["fic_flag"]
+        & df["c_flag"],
+    }
+
+    df["imputation_marker"] = np.select(
+        imputation_markers_and_conditions.values(),
+        imputation_markers_and_conditions.keys(),
+        default="error",
+    )
+
+    return df
diff --git a/tests/imputation_flag_data.csv b/tests/imputation_flag_data.csv
@@ -0,0 +1,28 @@
+reference,strata,period,target_variable,auxiliary,f_predictive_target_variable,b_predictive_target_variable,r_flag,fir_flag,bir_flag,c_flag,fic_flag,f_predictive_auxiliary,imputation_marker
+1,100,202001,8444.0,51.0,,,True,False,False,False,False,,r
+1,100,202002,,51.0,8444.0,2003.0,False,True,True,True,True,51.0,fir
+1,100,202003,2003.0,51.0,,1003.0,True,False,False,False,False,51.0,r
+1,100,202004,1003.0,51.0,2003.0,,True,False,False,False,False,51.0,r
+2,100,202001,,72.0,,,False,False,True,True,False,,bir
+2,100,202002,,,,,False,False,True,False,True,72.0,bir
+2,100,202003,,72.0,,3251.0,False,False,True,True,True,,bir
+2,100,202004,3251.0,72.0,,,True,False,False,False,False,72.0,r
+3,100,202001,,7.0,,7511.0,False,False,True,True,False,,bir
+3,100,202002,7511.0,7.0,,1234.0,True,False,False,False,False,7.0,r
+3,100,202003,1234.0,7.0,7511.0,1214.0,True,False,False,False,False,7.0,r
+3,100,202004,1214.0,7.0,1234.0,,True,False,False,False,False,7.0,r
+4,100,202001,64.0,81.0,,,True,False,False,False,False,,r
+4,100,202002,,81.0,64.0,,False,True,True,True,True,81.0,fir
+4,100,202003,,81.0,,254.0,False,True,True,True,True,81.0,fir
+4,100,202004,254.0,81.0,,,True,False,False,False,False,81.0,r
+5,100,202001,65.0,81.0,,342.0,True,False,False,False,False,,r
+5,100,202002,342.0,81.0,65.0,634.0,True,False,False,False,False,81.0,r
+5,100,202003,634.0,81.0,342.0,254.0,True,False,False,False,False,81.0,r
+5,100,202004,254.0,81.0,634.0,,True,False,False,False,False,81.0,r
+6,100,202001,64.0,81.0,,,True,False,False,False,False,,r
+6,100,202002,,81.0,64.0,654.0,False,True,True,True,True,81.0,fir
+6,100,202003,654.0,81.0,,,True,False,False,False,False,81.0,r
+6,100,202004,,81.0,654.0,,False,True,False,True,True,81.0,fir
+7,100,202001,,40.0,,,False,False,False,True,False,,c
+7,100,202002,,,,,False,False,False,False,True,40.0,fic
+7,100,202003,,,,,False,False,False,False,True,,fic
diff --git a/tests/test_data_matched_pair/case1_expected_output.csv b/tests/test_data_matched_pair/case1_expected_output.csv
@@ -1,4 +1,4 @@
-reference,strata,period,target_variable,f_matched_pair,f_matched_pair_count,b_matched_pair,b_matched_pair_count
+reference,strata,period,target_variable,f_matched_pair_target_variable,f_matched_pair_count,b_matched_pair_target_variable,b_matched_pair_count
 1,101,202401,237,False,0,True,2
 1,101,202402,281,True,2,False,1
 1,101,202403,,False,1,False,0

diff --git a/tests/test_data_matched_pair/case2_expected_output.csv b/tests/test_data_matched_pair/case2_expected_output.csv
@@ -1,4 +1,4 @@
-reference,strata,period,target_variable,f_matched_pair,f_matched_pair_count,b_matched_pair,b_matched_pair_count
+reference,strata,period,target_variable,f_matched_pair_target_variable,f_matched_pair_count,b_matched_pair_target_variable,b_matched_pair_count
 1,101,202401,237,False,0,True,2
 1,101,202402,281,True,2,False,1
 1,101,202403,,False,1,False,0

diff --git a/tests/test_data_matched_pair/case3_expected_output.csv b/tests/test_data_matched_pair/case3_expected_output.csv
@@ -1,4 +1,4 @@
-reference,strata,period,target_variable,f_matched_pair,f_matched_pair_count,b_matched_pair,b_matched_pair_count
+reference,strata,period,target_variable,f_matched_pair_target_variable,f_matched_pair_count,b_matched_pair_target_variable,b_matched_pair_count
 1,101,202401,237,False,0,True,2
 1,101,202402,281,True,2,False,0
 1,101,202403,,False,0,False,0

diff --git a/tests/test_flag_and_count_matched_pairs.py b/tests/test_flag_and_count_matched_pairs.py
@@ -28,11 +28,15 @@
 class TestMatchedPair:
     def test_flag_matched_pair_merge_forward(self, expected_output_file):
         df_expected_output = load_and_format(expected_output_file)
-        df_expected_output.drop(
-            ["f_matched_pair_count", "b_matched_pair", "b_matched_pair_count"],
-            axis=1,
-            inplace=True,
-        )
+        df_expected_output = df_expected_output[
+            [
+                "reference",
+                "strata",
+                "period",
+                "target_variable",
+                "f_matched_pair_target_variable",
+            ]
+        ]
         df_input = df_expected_output[
             ["reference", "strata", "period", "target_variable"]
         ]
@@ -44,11 +48,15 @@ def test_flag_matched_pair_merge_forward(self, expected_output_file):
 
     def test_flag_matched_pair_merge_backward(self, expected_output_file):
         df_expected_output = load_and_format(expected_output_file)
-        df_expected_output.drop(
-            ["f_matched_pair_count", "f_matched_pair", "b_matched_pair_count"],
-            axis=1,
-            inplace=True,
-        )
+        df_expected_output = df_expected_output[
+            [
+                "reference",
+                "strata",
+                "period",
+                "target_variable",
+                "b_matched_pair_target_variable",
+            ]
+        ]
         df_input = df_expected_output[
             ["reference", "strata", "period", "target_variable"]
         ]
@@ -60,33 +68,67 @@ def test_flag_matched_pair_merge_backward(self, expected_output_file):
 
     def test_count_matched_pair_forward(self, expected_output_file):
         df_expected_output = load_and_format(expected_output_file)
-        df_expected_output.drop(
-            ["b_matched_pair", "b_matched_pair_count"], axis=1, inplace=True
-        )
+        df_expected_output = df_expected_output[
+            [
+                "reference",
+                "strata",
+                "period",
+                "target_variable",
+                "f_matched_pair_target_variable",
+                "f_matched_pair_count",
+            ]
+        ]
         df_input = df_expected_output[
-            ["reference", "strata", "period", "target_variable", "f_matched_pair"]
+            [
+                "reference",
+                "strata",
+                "period",
+                "target_variable",
+                "f_matched_pair_target_variable",
+            ]
         ]
-        df_output = count_matches(df_input, "f_matched_pair", "period", "strata")
+        df_output = count_matches(
+            df_input, "f_matched_pair_target_variable", "period", "strata"
+        )
         assert_frame_equal(df_output, df_expected_output)
 
     def test_count_matches_backward(self, expected_output_file):
         df_expected_output = load_and_format(expected_output_file)
-        df_expected_output.drop(
-            ["f_matched_pair", "f_matched_pair_count"], axis=1, inplace=True
-        )
+        df_expected_output = df_expected_output[
+            [
+                "reference",
+                "strata",
+                "period",
+                "target_variable",
+                "b_matched_pair_target_variable",
+                "b_matched_pair_count",
+            ]
+        ]
         df_input = df_expected_output[
-            ["reference", "strata", "period", "target_variable", "b_matched_pair"]
+            [
+                "reference",
+                "strata",
+                "period",
+                "target_variable",
+                "b_matched_pair_target_variable",
+            ]
         ]
-        df_output = count_matches(df_input, "b_matched_pair", "period", "strata")
+        df_output = count_matches(
+            df_input, "b_matched_pair_target_variable", "period", "strata"
+        )
         assert_frame_equal(df_output, df_expected_output)
 
     def test_flag_matched_pair_shift_forward(self, expected_output_file):
         df_expected_output = load_and_format(expected_output_file)
-        df_expected_output.drop(
-            ["f_matched_pair_count", "b_matched_pair", "b_matched_pair_count"],
-            axis=1,
-            inplace=True,
-        )
+        df_expected_output = df_expected_output[
+            [
+                "reference",
+                "strata",
+                "period",
+                "target_variable",
+                "f_matched_pair_target_variable",
+            ]
+        ]
         df_input = df_expected_output[
             ["reference", "strata", "period", "target_variable"]
         ]
@@ -98,11 +140,15 @@ def test_flag_matched_pair_shift_forward(self, expected_output_file):
 
     def test_flag_matched_pair_shift_backward(self, expected_output_file):
         df_expected_output = load_and_format(expected_output_file)
-        df_expected_output.drop(
-            ["f_matched_pair_count", "f_matched_pair", "b_matched_pair_count"],
-            axis=1,
-            inplace=True,
-        )
+        df_expected_output = df_expected_output[
+            [
+                "reference",
+                "strata",
+                "period",
+                "target_variable",
+                "b_matched_pair_target_variable",
+            ]
+        ]
         df_input = df_expected_output[
             ["reference", "strata", "period", "target_variable"]
         ]