From a5e1fec75d14f992b51fee836256c5859d355a02 Mon Sep 17 00:00:00 2001
From: zogkoa <Anton.Zogkolli@ons.gov.uk>
Date: Thu, 13 Jun 2024 15:13:43 +0100
Subject: [PATCH 1/5] Fix flags when value not supplied (missing date)

* Add a fill group column that bounds forward and backward filling
* Start a new fill group when there is a missing date too
* Add period argument to the function, to check for missing dates
---
 src/imputation_flags.py | 31 +++++++++++++++++++++++++++----
 1 file changed, 27 insertions(+), 4 deletions(-)

diff --git a/src/imputation_flags.py b/src/imputation_flags.py
index 91bc04ad..98e5e2ec 100644
--- a/src/imputation_flags.py
+++ b/src/imputation_flags.py
@@ -5,10 +5,12 @@
 def create_impute_flags(
     df: pd.DataFrame,
     target: str,
+    period: str,
     reference: str,
     strata: str,
     auxiliary: str,
     predictive_auxiliary: str,
+    time_difference=1,
 ):
 
     """
@@ -24,9 +26,10 @@ def create_impute_flags(
         DataFrame containing forward, backward predictive period columns (
         These columns are created by calling flag_matched_pair_merge forward
         and backwards)
-
     target : str
         Column name containing target variable.
+    period: str
+        Column name containing date variable.
     reference : str
         Column name containing business reference id.
     strata : str
@@ -36,6 +39,9 @@ def create_impute_flags(
     predictive_auxiliary: str
         Column name containing predictive auxiliary data, this is created,
         by flag_matched_pair_merge function.
+     time_difference: int
+        Lookup distance for matched pairs.
+
 
     Returns
     -------
@@ -56,11 +62,25 @@ def create_impute_flags(
     backward_target_roll = "b_predictive_" + target + "_roll"
     forward_aux_roll = "f_predictive_" + auxiliary + "_roll"
 
-    df[forward_target_roll] = df.groupby([reference, strata])[
+    # TODO : similar conditions at cum imputation links
+    df["fill_group"] = (
+        (
+            (
+                df[period] - pd.DateOffset(months=time_difference)
+                != df.shift(time_difference)[period]
+            )
+            | (df[strata].diff(time_difference) != 0)
+            | (df[reference].diff(time_difference) != 0)
+        )
+        .astype("int")
+        .cumsum()
+    )
+
+    df[forward_target_roll] = df.groupby([reference, strata, "fill_group"])[
         "f_predictive_" + target
     ].ffill()
 
-    df[backward_target_roll] = df.groupby([reference, strata])[
+    df[backward_target_roll] = df.groupby([reference, strata, "fill_group"])[
         "b_predictive_" + target
     ].bfill()
 
@@ -77,7 +97,9 @@ def create_impute_flags(
     construction_conditions = df[target].isna() & df[auxiliary].notna()
     df["c_flag"] = np.where(construction_conditions, True, False)
 
-    df[forward_aux_roll] = df.groupby([reference, strata])[predictive_auxiliary].ffill()
+    df[forward_aux_roll] = df.groupby([reference, strata, "fill_group"])[
+        predictive_auxiliary
+    ].ffill()
 
     fic_conditions = df[target].isna() & df[forward_aux_roll].notna()
     df["fic_flag"] = np.where(fic_conditions, True, False)
@@ -88,6 +110,7 @@ def create_impute_flags(
             backward_target_roll,
             forward_aux_roll,
             predictive_auxiliary,
+            "fill_group",
         ],
         axis=1,
         inplace=True,

From e5e87de3cd37cc98b3c54cdd12c9ae89f73a3504 Mon Sep 17 00:00:00 2001
From: zogkoa <Anton.Zogkolli@ons.gov.uk>
Date: Thu, 13 Jun 2024 15:19:11 +0100
Subject: [PATCH 2/5] Add not supplied scenario to flags test data

Add scenario with missing date
Adjust test function to use period column
---
 tests/imputation_flag_data.csv | 5 +++++
 tests/test_imputation_flags.py | 1 +
 2 files changed, 6 insertions(+)

diff --git a/tests/imputation_flag_data.csv b/tests/imputation_flag_data.csv
index 31b56aa8..2553e621 100644
--- a/tests/imputation_flag_data.csv
+++ b/tests/imputation_flag_data.csv
@@ -26,3 +26,8 @@ reference,strata,period,target_variable,auxiliary,f_predictive_target_variable,b
 7,100,202001,,40.0,,,False,False,False,True,False,,c
 7,100,202002,,,,,False,False,False,False,True,40.0,fic
 7,100,202003,,,,,False,False,False,False,True,,fic
+8,100,202001,789,55,,,TRUE,FALSE,FALSE,FALSE,FALSE,,r
+8,100,202002,,66,789,,FALSE,TRUE,FALSE,TRUE,TRUE,55,fir
+8,100,202004,,77,,987,FALSE,FALSE,TRUE,TRUE,FALSE,,bir
+8,100,202005,987,88,,,TRUE,FALSE,FALSE,FALSE,FALSE,77,r
+
diff --git a/tests/test_imputation_flags.py b/tests/test_imputation_flags.py
index 315b5fa3..5ba671c1 100644
--- a/tests/test_imputation_flags.py
+++ b/tests/test_imputation_flags.py
@@ -32,6 +32,7 @@ def test_create_impute_flags(self, imputation_flag_test_data):
         df_output = create_impute_flags(
             df=df_input,
             target="target_variable",
+            period="period",
             reference="reference",
             strata="strata",
             auxiliary="auxiliary",

From d0d4cc424ff543142cd3d74c664778768fd30299 Mon Sep 17 00:00:00 2001
From: zogkoa <Anton.Zogkolli@ons.gov.uk>
Date: Thu, 13 Jun 2024 15:29:27 +0100
Subject: [PATCH 3/5] Remove blank line at the end

---
 tests/imputation_flag_data.csv | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/imputation_flag_data.csv b/tests/imputation_flag_data.csv
index 2553e621..35a22778 100644
--- a/tests/imputation_flag_data.csv
+++ b/tests/imputation_flag_data.csv
@@ -30,4 +30,3 @@ reference,strata,period,target_variable,auxiliary,f_predictive_target_variable,b
 8,100,202002,,66,789,,FALSE,TRUE,FALSE,TRUE,TRUE,55,fir
 8,100,202004,,77,,987,FALSE,FALSE,TRUE,TRUE,FALSE,,bir
 8,100,202005,987,88,,,TRUE,FALSE,FALSE,FALSE,FALSE,77,r
-

From 1c01a32b141deff5bceeaf1964cc529a80aaab3a Mon Sep 17 00:00:00 2001
From: zogkoa <Anton.Zogkolli@ons.gov.uk>
Date: Wed, 19 Jun 2024 11:35:05 +0100
Subject: [PATCH 4/5] Sort values for conditions

The strata and reference checks compare each row with the previous row,
so the values must be sorted by reference, strata and period first.
---
 src/imputation_flags.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/imputation_flags.py b/src/imputation_flags.py
index 98e5e2ec..870ff4bd 100644
--- a/src/imputation_flags.py
+++ b/src/imputation_flags.py
@@ -62,6 +62,8 @@ def create_impute_flags(
     backward_target_roll = "b_predictive_" + target + "_roll"
     forward_aux_roll = "f_predictive_" + auxiliary + "_roll"
 
+    df.sort_values([reference, strata, period], inplace=True)
+
     # TODO : similar conditions at cum imputation links
     df["fill_group"] = (
         (

From 21110e2f73c71093cff655adcaf8e4e7b5a4f6b6 Mon Sep 17 00:00:00 2001
From: zogkoa <Anton.Zogkolli@ons.gov.uk>
Date: Wed, 19 Jun 2024 13:07:12 +0100
Subject: [PATCH 5/5] Set time_difference to 1, remove astype(int)

* Fill group works only with diff(1), since we are separating the values
by strata and by whether a missing date exists.
* astype(int) is redundant: summing True/False is the same as summing 1/0
* Add kwargs and update docstrings
---
 src/imputation_flags.py | 22 +++++++---------------
 1 file changed, 7 insertions(+), 15 deletions(-)

diff --git a/src/imputation_flags.py b/src/imputation_flags.py
index 870ff4bd..984aac78 100644
--- a/src/imputation_flags.py
+++ b/src/imputation_flags.py
@@ -10,7 +10,7 @@ def create_impute_flags(
     strata: str,
     auxiliary: str,
     predictive_auxiliary: str,
-    time_difference=1,
+    **kwargs
 ):
 
     """
@@ -39,9 +39,8 @@ def create_impute_flags(
     predictive_auxiliary: str
         Column name containing predictive auxiliary data, this is created,
         by flag_matched_pair_merge function.
-     time_difference: int
-        Lookup distance for matched pairs.
-
+     kwargs : mapping, optional
+        A dictionary of keyword arguments passed into func.
 
     Returns
     -------
@@ -66,17 +65,10 @@ def create_impute_flags(
 
     # TODO : similar conditions at cum imputation links
     df["fill_group"] = (
-        (
-            (
-                df[period] - pd.DateOffset(months=time_difference)
-                != df.shift(time_difference)[period]
-            )
-            | (df[strata].diff(time_difference) != 0)
-            | (df[reference].diff(time_difference) != 0)
-        )
-        .astype("int")
-        .cumsum()
-    )
+        (df[period] - pd.DateOffset(months=1) != df.shift(1)[period])
+        | (df[strata].diff(1) != 0)
+        | (df[reference].diff(1) != 0)
+    ).cumsum()
 
     df[forward_target_roll] = df.groupby([reference, strata, "fill_group"])[
         "f_predictive_" + target