ONSdigital · robertswh · May 22, 2024 · May 15, 2024 · May 16, 2024 · May 16, 2024
diff --git a/src/flag_and_count_matched_pairs.py b/src/flag_and_count_matched_pairs.py
@@ -39,7 +39,7 @@ def flag_matched_pair_merge(
         time_difference = -time_difference
 
     # Creating new DF, shifting period for forward or backward
-    df_with_predictive_column = df[[reference, strata, target]]
+    df_with_predictive_column = df.copy()[[reference, strata, target]]
     df_with_predictive_column["predictive_period"] = df[period] + pd.DateOffset(
         months=time_difference
     )
@@ -55,7 +55,7 @@ def flag_matched_pair_merge(
         how="left",
     )
 
-    matched_col_name = forward_or_backward + "_matched_pair"
+    matched_col_name = forward_or_backward + "_matched_pair_" + target
 
     df[matched_col_name] = np.where(
         df[[target, predictive_col_name]].isnull().any(axis=1), False, True
@@ -107,7 +107,7 @@ def flag_matched_pair_shift(
     df["validate_date"] = np.where(
         df[period].dt.month - df["predictive_period"].dt.month == shift, True, False
     )
-    matched_col_name = forward_or_backward + "_matched_pair"
+    matched_col_name = forward_or_backward + "_matched_pair_" + target
 
     df[matched_col_name] = np.where(
         df[[target, predictive_col_name]].isnull().any(axis=1) | (~df["validate_date"]),

diff --git a/src/imputation_flags.py b/src/imputation_flags.py
@@ -0,0 +1,115 @@
+import numpy as np
+
+from src.flag_and_count_matched_pairs import flag_matched_pair_merge
+
+
+def create_impute_flags(df, target, reference, strata, auxiliary, period="period"):
+    """
+    Create the logical columns describing how each target value can be filled.
+
+    The output columns are needed to create the string flag column for
+    imputation methods.
+    Function requires the f_predictive and b_predictive columns of the target
+    produced by the `flag_matched_pair` function.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        DataFrame containing the f_predictive_<target> and
+        b_predictive_<target> columns (these columns are created by calling
+        flag_matched_pair_merge forward and backwards)
+    target : str
+        column name containing target variable
+    reference : str
+        column name containing business reference id
+    strata : str
+        column name containing strata information (sic)
+    auxiliary : str
+        column name containing auxiliary data
+    period : str, optional
+        column name containing the period, handed to
+        flag_matched_pair_merge when matching on the auxiliary variable
+        (default "period")
+
+    Returns
+    -------
+    pd.DataFrame
+        Dataframe with five additional logical columns determining if target
+        is a return (r_flag), can be imputed by forward imputation (fir_flag),
+        backward imputation (bir_flag), can be constructed (c_flag), or is
+        missing while a forward-filled predictive auxiliary exists (fic_flag)
+
+    Raises
+    ------
+    KeyError
+        If f_predictive_<target> or b_predictive_<target> is missing from df
+    """
+    for direction in ("f", "b"):
+        predictive_col = "{}_predictive_{}".format(direction, target)
+        if predictive_col not in df.columns:
+            raise KeyError(
+                "Dataframe needs column '{}',".format(predictive_col)
+                + " run flag_matched_pair function first"
+            )
+
+    groups = [reference, strata]
+    target_missing = df[target].isna()
+
+    # Roll the predictive values along each reference/strata group; kept as
+    # local Series so no temporary columns are written to the caller's frame.
+    f_roll = df.groupby(groups)["f_predictive_" + target].ffill()
+    b_roll = df.groupby(groups)["b_predictive_" + target].bfill()
+
+    # Flags are added in the order r, fir, bir, c (fic follows further down).
+    df["r_flag"] = ~target_missing
+    df["fir_flag"] = f_roll.notna() & target_missing
+    df["bir_flag"] = b_roll.notna() & target_missing
+    df["c_flag"] = target_missing & df[auxiliary].notna()
+
+    # Pass the caller-supplied column names (not literal strings) so frames
+    # whose columns are not called "auxiliary"/"reference"/"strata" work too.
+    df = flag_matched_pair_merge(
+        df=df,
+        forward_or_backward="f",
+        target=auxiliary,
+        period=period,
+        reference=reference,
+        strata=strata,
+    )
+
+    # Computed from the frame returned above rather than reusing the earlier
+    # mask, because that frame is not guaranteed to share the input's index.
+    f_aux_roll = df.groupby(groups)["f_predictive_" + auxiliary].ffill()
+    df["fic_flag"] = df[target].isna() & f_aux_roll.notna()
+
+    # Remove the helper columns of the auxiliary matched-pair step.
+    df.drop(
+        [
+            "f_predictive_" + auxiliary,
+            "f_matched_pair_" + auxiliary,
+        ],
+        axis=1,
+        inplace=True,
+    )
+
+    return df
+
+
+def generate_imputation_flag_string(df):
+    """
+    Add an `imputation_flag` string column built from the logical flag columns.
+
+    Each row is labelled with the first of r, fir, bir, fic, c whose flag
+    column is True; rows where none is True are labelled "error".
+
+    df must hold boolean r_flag, fir_flag, bir_flag, fic_flag and c_flag
+    columns; it is modified in place and returned.
+    """
+    # np.select keeps the first matching condition, giving the priority order.
+    flag_columns = ["r_flag", "fir_flag", "bir_flag", "fic_flag", "c_flag"]
+    labels = ["r", "fir", "bir", "fic", "c"]
+    df["imputation_flag"] = np.select(
+        [df[column] for column in flag_columns], labels, default="error"
+    )
+
+    return df
diff --git a/tests/imputation_flag_data.csv b/tests/imputation_flag_data.csv
@@ -0,0 +1,28 @@
+reference,strata,period,target_variable,auxiliary,f_predictive_target_variable,b_predictive_target_variable,r_flag,fir_flag,bir_flag,c_flag,fic_flag,imputation_flag
+1,100,202001,8444,51,,,TRUE,FALSE,FALSE,FALSE,FALSE,r
+1,100,202002,,51,8444,2003,FALSE,TRUE,TRUE,TRUE,TRUE,fir
+1,100,202003,2003,51,,1003,TRUE,FALSE,FALSE,FALSE,FALSE,r
+1,100,202004,1003,51,2003,,TRUE,FALSE,FALSE,FALSE,FALSE,r
+2,100,202001,,72,,,FALSE,FALSE,TRUE,TRUE,FALSE,bir
+2,100,202002,,,,,FALSE,FALSE,TRUE,FALSE,TRUE,bir
+2,100,202003,,72,,3251,FALSE,FALSE,TRUE,TRUE,TRUE,bir
+2,100,202004,3251,72,,,TRUE,FALSE,FALSE,FALSE,FALSE,r
+3,100,202001,,7,,7511,FALSE,FALSE,TRUE,TRUE,FALSE,bir
+3,100,202002,7511,7,,1234,TRUE,FALSE,FALSE,FALSE,FALSE,r
+3,100,202003,1234,7,7511,1214,TRUE,FALSE,FALSE,FALSE,FALSE,r
+3,100,202004,1214,7,1234,,TRUE,FALSE,FALSE,FALSE,FALSE,r
+4,100,202001,64,81,,,TRUE,FALSE,FALSE,FALSE,FALSE,r
+4,100,202002,,81,64,,FALSE,TRUE,TRUE,TRUE,TRUE,fir
+4,100,202003,,81,,254,FALSE,TRUE,TRUE,TRUE,TRUE,fir
+4,100,202004,254,81,,,TRUE,FALSE,FALSE,FALSE,FALSE,r
+5,100,202001,65,81,,342,TRUE,FALSE,FALSE,FALSE,FALSE,r
+5,100,202002,342,81,65,634,TRUE,FALSE,FALSE,FALSE,FALSE,r
+5,100,202003,634,81,342,254,TRUE,FALSE,FALSE,FALSE,FALSE,r
+5,100,202004,254,81,634,,TRUE,FALSE,FALSE,FALSE,FALSE,r
+6,100,202001,64,81,,,TRUE,FALSE,FALSE,FALSE,FALSE,r
+6,100,202002,,81,64,654,FALSE,TRUE,TRUE,TRUE,TRUE,fir
+6,100,202003,654,81,,,TRUE,FALSE,FALSE,FALSE,FALSE,r
+6,100,202004,,81,654,,FALSE,TRUE,FALSE,TRUE,TRUE,fir
+7,100,202001,,40,,,FALSE,FALSE,FALSE,TRUE,FALSE,c
+7,100,202002,,,,,FALSE,FALSE,FALSE,FALSE,TRUE,fic
+7,100,202003,,,,,FALSE,FALSE,FALSE,FALSE,TRUE,fic
diff --git a/tests/test_data_matched_pair/case1_expected_output.csv b/tests/test_data_matched_pair/case1_expected_output.csv
@@ -1,4 +1,4 @@
-reference,strata,period,target_variable,f_matched_pair,f_matched_pair_count,b_matched_pair,b_matched_pair_count
+reference,strata,period,target_variable,f_matched_pair_target_variable,f_matched_pair_count,b_matched_pair_target_variable,b_matched_pair_count
 1,101,202401,237,False,0,True,2
 1,101,202402,281,True,2,False,1
 1,101,202403,,False,1,False,0

diff --git a/tests/test_data_matched_pair/case2_expected_output.csv b/tests/test_data_matched_pair/case2_expected_output.csv
@@ -1,4 +1,4 @@
-reference,strata,period,target_variable,f_matched_pair,f_matched_pair_count,b_matched_pair,b_matched_pair_count
+reference,strata,period,target_variable,f_matched_pair_target_variable,f_matched_pair_count,b_matched_pair_target_variable,b_matched_pair_count
 1,101,202401,237,False,0,True,2
 1,101,202402,281,True,2,False,1
 1,101,202403,,False,1,False,0

diff --git a/tests/test_data_matched_pair/case3_expected_output.csv b/tests/test_data_matched_pair/case3_expected_output.csv
@@ -1,4 +1,4 @@
-reference,strata,period,target_variable,f_matched_pair,f_matched_pair_count,b_matched_pair,b_matched_pair_count
+reference,strata,period,target_variable,f_matched_pair_target_variable,f_matched_pair_count,b_matched_pair_target_variable,b_matched_pair_count
 1,101,202401,237,False,0,True,2
 1,101,202402,281,True,2,False,0
 1,101,202403,,False,0,False,0

diff --git a/tests/test_flag_and_count_matched_pairs.py b/tests/test_flag_and_count_matched_pairs.py
@@ -28,11 +28,15 @@
 class TestMatchedPair:
     def test_flag_matched_pair_merge_forward(self, expected_output_file):
         df_expected_output = load_and_format(expected_output_file)
-        df_expected_output.drop(
-            ["f_matched_pair_count", "b_matched_pair", "b_matched_pair_count"],
-            axis=1,
-            inplace=True,
-        )
+        df_expected_output = df_expected_output[
+            [
+                "reference",
+                "strata",
+                "period",
+                "target_variable",
+                "f_matched_pair_target_variable",
+            ]
+        ]
         df_input = df_expected_output[
             ["reference", "strata", "period", "target_variable"]
         ]
@@ -44,11 +48,15 @@ def test_flag_matched_pair_merge_forward(self, expected_output_file):
 
     def test_flag_matched_pair_merge_backward(self, expected_output_file):
         df_expected_output = load_and_format(expected_output_file)
-        df_expected_output.drop(
-            ["f_matched_pair_count", "f_matched_pair", "b_matched_pair_count"],
-            axis=1,
-            inplace=True,
-        )
+        df_expected_output = df_expected_output[
+            [
+                "reference",
+                "strata",
+                "period",
+                "target_variable",
+                "b_matched_pair_target_variable",
+            ]
+        ]
         df_input = df_expected_output[
             ["reference", "strata", "period", "target_variable"]
         ]
@@ -60,33 +68,67 @@ def test_flag_matched_pair_merge_backward(self, expected_output_file):
 
     def test_count_matched_pair_forward(self, expected_output_file):
         df_expected_output = load_and_format(expected_output_file)
-        df_expected_output.drop(
-            ["b_matched_pair", "b_matched_pair_count"], axis=1, inplace=True
-        )
+        df_expected_output = df_expected_output[
+            [
+                "reference",
+                "strata",
+                "period",
+                "target_variable",
+                "f_matched_pair_target_variable",
+                "f_matched_pair_count",
+            ]
+        ]
         df_input = df_expected_output[
-            ["reference", "strata", "period", "target_variable", "f_matched_pair"]
+            [
+                "reference",
+                "strata",
+                "period",
+                "target_variable",
+                "f_matched_pair_target_variable",
+            ]
         ]
-        df_output = count_matches(df_input, "f_matched_pair", "period", "strata")
+        df_output = count_matches(
+            df_input, "f_matched_pair_target_variable", "period", "strata"
+        )
         assert_frame_equal(df_output, df_expected_output)
 
     def test_count_matches_backward(self, expected_output_file):
         df_expected_output = load_and_format(expected_output_file)
-        df_expected_output.drop(
-            ["f_matched_pair", "f_matched_pair_count"], axis=1, inplace=True
-        )
+        df_expected_output = df_expected_output[
+            [
+                "reference",
+                "strata",
+                "period",
+                "target_variable",
+                "b_matched_pair_target_variable",
+                "b_matched_pair_count",
+            ]
+        ]
         df_input = df_expected_output[
-            ["reference", "strata", "period", "target_variable", "b_matched_pair"]
+            [
+                "reference",
+                "strata",
+                "period",
+                "target_variable",
+                "b_matched_pair_target_variable",
+            ]
         ]
-        df_output = count_matches(df_input, "b_matched_pair", "period", "strata")
+        df_output = count_matches(
+            df_input, "b_matched_pair_target_variable", "period", "strata"
+        )
         assert_frame_equal(df_output, df_expected_output)
 
     def test_flag_matched_pair_shift_forward(self, expected_output_file):
         df_expected_output = load_and_format(expected_output_file)
-        df_expected_output.drop(
-            ["f_matched_pair_count", "b_matched_pair", "b_matched_pair_count"],
-            axis=1,
-            inplace=True,
-        )
+        df_expected_output = df_expected_output[
+            [
+                "reference",
+                "strata",
+                "period",
+                "target_variable",
+                "f_matched_pair_target_variable",
+            ]
+        ]
         df_input = df_expected_output[
             ["reference", "strata", "period", "target_variable"]
         ]
@@ -98,11 +140,15 @@ def test_flag_matched_pair_shift_forward(self, expected_output_file):
 
     def test_flag_matched_pair_shift_backward(self, expected_output_file):
         df_expected_output = load_and_format(expected_output_file)
-        df_expected_output.drop(
-            ["f_matched_pair_count", "f_matched_pair", "b_matched_pair_count"],
-            axis=1,
-            inplace=True,
-        )
+        df_expected_output = df_expected_output[
+            [
+                "reference",
+                "strata",
+                "period",
+                "target_variable",
+                "b_matched_pair_target_variable",
+            ]
+        ]
         df_input = df_expected_output[
             ["reference", "strata", "period", "target_variable"]
         ]

diff --git a/tests/test_imputation_flags.py b/tests/test_imputation_flags.py
@@ -0,0 +1,45 @@
+from pathlib import Path
+
+import pytest
+from helper_functions import load_and_format
+from pandas.testing import assert_frame_equal
+
+from src.imputation_flags import create_impute_flags, generate_imputation_flag_string
+
+
+@pytest.fixture(scope="class")
+def imputation_flag_test_data():  # loaded once per test class
+    return load_and_format(Path("tests", "imputation_flag_data.csv"))
+
+
+class TestImputationFlags:
+    """Checks of the imputation flag helpers against imputation_flag_data.csv."""
+
+    def test_create_impute_flags(self, imputation_flag_test_data):
+        # The expected frame is the test data minus the string flag column;
+        # the input is that frame cut down to the pre-flag columns.
+        expected = imputation_flag_test_data.drop(columns=["imputation_flag"])
+        input_columns = [
+            "reference",
+            "strata",
+            "period",
+            "target_variable",
+            "auxiliary",
+            "f_predictive_target_variable",
+            "b_predictive_target_variable",
+        ]
+        actual = create_impute_flags(
+            df=expected[input_columns],
+            target="target_variable",
+            reference="reference",
+            strata="strata",
+            auxiliary="auxiliary",
+        )
+        assert_frame_equal(actual, expected)
+
+    def test_imputation_flag_strings(self, imputation_flag_test_data):
+        # The input is the expected frame without its string flag column.
+        expected = imputation_flag_test_data.copy()
+        flags_only = expected.drop(columns=["imputation_flag"])
+        actual = generate_imputation_flag_string(flags_only)
+        assert_frame_equal(actual, expected)