Merge pull request #65 from ONSdigital/419-test-outliering-with-anon-data

419-test-outliering-with-anon-data
ONSdigital · Aug 2, 2024 · 5d69cfe · 5d69cfe
2 parents 0559c73 + cee161d
commit 5d69cfe
Show file tree

Hide file tree

Showing 17 changed files with 185 additions and 192 deletions.
diff --git a/mbs_results/calculate_predicted_unit_value.py b/mbs_results/calculate_predicted_unit_value.py
@@ -1,29 +1,16 @@
-import pandas as pd
+import numpy as np
 
 
 def calculate_predicted_unit_value(
-    df: pd.DataFrame,
-    period: str,
-    strata: str,
-    aux: str,
-    sampled: str,
-    a_weight: str,
-    target_variable: str,
-    nw_ag_flag: str,
-) -> pd.DataFrame:
+    df, aux, sampled, a_weight, target_variable, nw_ag_flag
+):
     """
-    Calculate link between target_variable and predictive_variable by strata,
-    a match_col must be supplied which indicates if target_variable
-    and predictive_variable can be linked.
+    Calculate predicted unit value
 
     Parameters
     ----------
     df : pd.Dataframe
         Original dataframe.
-    period : str
-        Column name containing time period.
-    strata : str
-        Column name containing strata information (sic).
     aux : str
         Column name containing auxiliary variable (x).
     sampled : str
@@ -34,22 +21,26 @@ def calculate_predicted_unit_value(
         Column name of the predicted target variable.
     nw_ag_flag: str
         column name indicating whether it can't be winsorised-
-        boolean (1 means it can't be winsorised, 0 means it can).
+        boolean (True means it can't be winsorised, False means it can).
     Returns
     -------
     df : pd.DataFrame
         A pandas DataFrame with a new column containing the predicted unit value.
     """
 
-    df = df.loc[(df["sampled"] == 1) & (df["nw_ag_flag"] == 0)]
-    df = df.reset_index(drop=True)
-    # check if reset index creates problems down the line
+    winsorised = (df[sampled] == 1) & (not df[nw_ag_flag] is True)
+    filtered_df = df.loc[winsorised]
 
-    sum_weighted_target_values = (df["a_weight"] * df["target_variable"]).sum()
-    sum_weighted_auxiliary_values = (df["a_weight"] * df["aux"]).sum()
+    sum_weighted_target_values = (
+        filtered_df[a_weight] * filtered_df[target_variable]
+    ).sum()
+    sum_weighted_auxiliary_values = (filtered_df[a_weight] * filtered_df[aux]).sum()
 
-    df["predicted_unit_value"] = df["aux"].apply(
+    df["predicted_unit_value"] = df[aux].apply(
         lambda x: x * (sum_weighted_target_values / sum_weighted_auxiliary_values)
     )
 
+    non_winsorised = (df[sampled] == 0) | (df[nw_ag_flag] is True)
+    df["predicted_unit_value"] = df["predicted_unit_value"].mask(non_winsorised, np.nan)
+
     return df
diff --git a/mbs_results/calculate_ratio_estimation.py b/mbs_results/calculate_ratio_estimation.py
@@ -1,32 +1,24 @@
-import pandas as pd
+import numpy as np
 
 
 def calculate_ratio_estimation(
-    df: pd.DataFrame,
-    strata: str,
-    period: str,
-    aux: str,
-    sampled: str,
-    a_weight: str,
-    g_weight: str,
-    target_variable: str,
-    predicted_unit_value: str,
-    l_values: str,
-) -> pd.DataFrame:
+    df,
+    aux,
+    sampled,
+    a_weight,
+    g_weight,
+    target_variable,
+    predicted_unit_value,
+    l_values,
+    nw_ag_flag,
+):
 
     """
-    Calculate link between target_variable and predictive_variable by strata,
-    a match_col must be supplied which indicates if target_variable
-    and predictive_variable can be linked.
-
+    Calculate ratio estimation threshold
     Parameters
     ----------
     df : pd.Dataframe
         Original dataframe.
-    period : str
-        Column name containing time period.
-    strata : str
-        Column name containing strata information (sic).
     aux : str
         Column name containing auxiliary variable (x).
     sampled : str
@@ -41,20 +33,25 @@ def calculate_ratio_estimation(
         column name containing the predicted unit value
     l_values:str
         column containing the l values provided by methodology
+    nw_ag_flag: str
+        column name indicating whether it can't be winsorised-
+        boolean (True means it can't be winsorised, False means it can).
+
     Returns
     -------
     df : pd.DataFrame
-        A pandas DataFrame with a new column containing the predicted unit value.
+        A pandas DataFrame with a new column containing the ratio estimation.
     """
 
-    df = df[df["predicted_unit_value"].notna()]
-    df = df.reset_index(drop=True)
-    # check if reset index creates problems down the line
-
-    df["flag_calculation"] = df["a_weight"] * df["g_weight"]
-    df["ratio_estimation_treshold"] = df["predicted_unit_value"] + (
-        df["l_values"] / (df["flag_calculation"] - 1)
+    df["flag_calculation"] = df[a_weight] * df[g_weight]
+    df["ratio_estimation_treshold"] = (df[predicted_unit_value]) + (
+        df[l_values] / (df["flag_calculation"] - 1)
     )
     df = df.drop("flag_calculation", axis=1)
 
+    non_winsorised = (df[sampled] == 0) | (df[nw_ag_flag] is True)
+    df["ratio_estimation_treshold"] = df["ratio_estimation_treshold"].mask(
+        non_winsorised, np.nan
+    )
+
     return df
diff --git a/mbs_results/calculate_winsorised_weight.py b/mbs_results/calculate_winsorised_weight.py
@@ -1,33 +1,32 @@
 import numpy as np
-import pandas as pd
 
 
 def calculate_winsorised_weight(
-    df: pd.DataFrame,
-    strata: str,
-    period: str,
-    aux: str,
-    sampled: str,
-    a_weight: str,
-    g_weight: str,
-    target_variable: str,
-    nw_ag_flag: str,
-    predicted_unit_value: str,
-    l_values: str,
-    ratio_estimation_treshold: str,
-) -> pd.DataFrame:
+    df,
+    strata,
+    period,
+    aux,
+    sampled,
+    a_weight,
+    g_weight,
+    target_variable,
+    predicted_unit_value,
+    l_values,
+    ratio_estimation_treshold,
+    nw_ag_flag,
+):
 
     """
     Calculate winsorised weight
 
     Parameters
     ----------
     df : pd.Dataframe
-        Original dataframe.
-    period : str
-        Column name containing time period.
+        Original dataframe
     strata : str
         Column name containing strata information (sic).
+    period : str
+        Column name containing time period.
     aux : str
         Column name containing auxiliary variable (x).
     sampled : str
@@ -38,38 +37,38 @@ def calculate_winsorised_weight(
         column name containing the g weight.
     target_variable : str
         Column name of the predicted target variable.
-    nw_ag_flag: str
-        column name indicating whether it can't be winsorised-
-        boolean (1 means it can't be winsorised, 0 means it can).
     predicted_unit_value: str
         column name containing the predicted unit value.
     l_values: str
         column name containing the l values as provided by methodology.
     ratio_estimation_treshold: str
         column name containing the previously calculated ratio estimation threshold.
+    nw_ag_flag: str
+        column name indicating whether it can't be winsorised-
+        boolean (True means it can't be winsorised, False means it can).
 
 
     Returns
     -------
     df : pd.DataFrame
-        A pandas DataFrame with a new column containing the predicted unit value.
+        A pandas DataFrame with a new column containing the winsorised weights.
     """
 
-    df = df[df["predicted_unit_value"].notna()]
-    df = df.reset_index(drop=True)
-    # check if reset index creates problems down the line
+    df["w"] = df[a_weight] * df[g_weight]
 
-    df["w"] = df["a_weight"] * df["g_weight"]
-
-    df["new_target"] = df["target_variable"] / df["w"] + (
-        df["ratio_estimation_treshold"] - (df["ratio_estimation_treshold"] / df["w"])
+    df["new_target"] = (df[target_variable] / df["w"]) + (
+        df[ratio_estimation_treshold] - (df[ratio_estimation_treshold] / df["w"])
     )
 
-    mask = df["target_variable"] <= df["ratio_estimation_treshold"]
-    df["new_target_variable"] = np.where(mask, df["target_variable"], df["new_target"])
+    mask = df[target_variable] <= df[ratio_estimation_treshold]
+    df["new_target_variable"] = np.where(mask, df[target_variable], df["new_target"])
 
-    df["outlier_weight"] = df["new_target_variable"] / df["target_variable"]
+    df["outlier_weight"] = df["new_target_variable"] / df[target_variable]
 
     df = df.drop(["w", "new_target"], axis=1)
 
+    non_winsorised = (df[sampled] == 0) | (df[nw_ag_flag] is True)
+    df["outlier_weight"] = df["outlier_weight"].mask(non_winsorised, np.nan)
+    df["new_target_variable"] = df["new_target_variable"].mask(non_winsorised, np.nan)
+
     return df
diff --git a/mbs_results/flag_for_winsorisation.py b/mbs_results/flag_for_winsorisation.py
@@ -26,7 +26,7 @@ def winsorisation_flag(df, a_weight, g_weight):
 
     df["flag_calculation"] = df[a_weight] * df[g_weight]
 
-    df["nw_ag_flag"] = df["flag_calculation"].apply(lambda x: 1 if x <= 1 else 0)
+    df["nw_ag_flag"] = df["flag_calculation"].apply(lambda x: True if x <= 1 else 0)
 
     df = df.drop("flag_calculation", axis=1)
 

diff --git a/tests/data/winsorisation/flag_data.csv b/tests/data/winsorisation/flag_data.csv
@@ -1,16 +1,16 @@
 a_weight,g_weight,nw_ag_flag
-1.666666667,1.023809524,0
-1.666666667,1.023809524,0
-1.666666667,1.023809524,0
-1.666666667,1.023809524,0
-1.666666667,1.023809524,0
-2.5,1.023809524,0
-2.5,1.023809524,0
-2.5,1.023809524,0
-2.5,1.023809524,0
-2.5,1.023809524,0
-0.32,0.004,1
-0.32,0.004,1
-,0.004,0
-,0.004,0
-0.5,2.0,1
+1.666666667,1.023809524,False
+1.666666667,1.023809524,False
+1.666666667,1.023809524,False
+1.666666667,1.023809524,False
+1.666666667,1.023809524,False
+2.5,1.023809524,False
+2.5,1.023809524,False
+2.5,1.023809524,False
+2.5,1.023809524,False
+2.5,1.023809524,False
+0.32,0.004,True
+0.32,0.004,True
+,0.004,False
+,0.004,False
+0.5,2.0,True
diff --git a/tests/data/winsorisation/predicted_unit_data.csv b/tests/data/winsorisation/predicted_unit_data.csv
diff --git a/tests/data/winsorisation/predicted_unit_value.csv b/tests/data/winsorisation/predicted_unit_value.csv
diff --git a/tests/data/winsorisation/predicted_unit_value_data.csv b/tests/data/winsorisation/predicted_unit_value_data.csv
@@ -0,0 +1,15 @@
+strata,period,aux,sampled,a_weight,g_weight,target_variable,nw_ag_flag
+101,202401,10,0,1.666666667,1.023809524,12,False
+101,202401,23,1,1.666666667,1.023809524,20,False
+101,202401,41,1,1.666666667,1.023809524,20,False
+101,202401,53,1,1.666666667,1.023809524,40,False
+101,202401,12,0,1.666666667,1.023809524,10,False
+102,202401,50,1,2.5,1.023809524,60,False
+102,202401,40,1,2.5,1.023809524,50,False
+102,202401,45,0,2.5,1.023809524,50,False
+102,202401,70,0,2.5,1.023809524,60,False
+102,202401,86,0,2.5,1.023809524,90,True
+103,202401,20,0,0.32,0.004,90,True
+103,202401,30,0,0.32,0.004,90,False
+104,202401,20,0,,0.004,90,False
+104,202401,30,0,,0.004,90,True
diff --git a/tests/data/winsorisation/predicted_unit_value_output.csv b/tests/data/winsorisation/predicted_unit_value_output.csv
@@ -1,6 +1,15 @@
 strata,period,aux,sampled,a_weight,g_weight,target_variable,nw_ag_flag,predicted_unit_value
-101,202312,23,1,1.666666667,1.023809524,20,0,22.361111
-101,202401,41,1,1.666666667,1.023809524,20,0,39.861111
-101,202401,53,1,1.666666667,1.023809524,40,0,51.527778
-102,202401,50,1,2.5,1.023809524,60,0,48.611111
-102,202401,40,1,2.5,1.023809524,50,0,38.888889
+101,202401,10,0,1.666666667,1.023809524,12,False,
+101,202312,23,1,1.666666667,1.023809524,20,False,22.361111
+101,202401,41,1,1.666666667,1.023809524,20,False,39.861111
+101,202401,53,1,1.666666667,1.023809524,40,False,51.527778
+101,202401,12,0,1.666666667,1.023809524,10,False,
+102,202401,50,1,2.5,1.023809524,60,False,48.611111
+102,202401,40,1,2.5,1.023809524,50,False,38.888889
+102,202401,45,0,2.5,1.023809524,50,False,
+102,202401,70,0,2.5,1.023809524,60,False,
+102,202401,86,0,2.5,1.023809524,90,True,
+103,202401,20,0,0.32,0.004,90,True,
+103,202401,30,0,0.32,0.004,90,False,
+104,202401,20,0,,0.004,90,False,
+104,202401,30,0,,0.004,90,True,
diff --git a/tests/data/winsorisation/ratio_estimation_data.csv b/tests/data/winsorisation/ratio_estimation_data.csv
@@ -1,13 +1,13 @@
-strata,period,aux,sampled,a_weight,g_weight,target_variable,predicted_unit_value,l_values
-101,202401,10,0,1.666666667,1.023809524,12
-101,202312,23,1,1.666666667,1.023809524,20,22.361111,0.5
-101,202401,41,1,1.666666667,1.023809524,20,39.861111,0.5
-101,202401,53,1,1.666666667,1.023809524,40,51.527778,0.5
-101,202401,12,0,1.666666667,1.023809524,10
-102,202401,50,1,2.5,1.023809524,60,48.611111,0.5
-102,202401,40,1,2.5,1.023809524,50,38.888889,0.5
-102,202401,45,0,2.5,1.023809524,50
-102,202401,70,0,2.5,1.023809524,60
-102,202401,86,0,2.5,1.023809524,90
-104,202401,20,0,,0.004,90
-104,202401,30,0,,0.004,90
+strata,period,aux,sampled,a_weight,g_weight,target_variable,nw_ag_flag,predicted_unit_value,l_value
+101,202401,10,0,1.666666667,1.023809524,12,False,,
+101,202312,23,1,1.666666667,1.023809524,20,False,22.361111,0.5
+101,202401,41,1,1.666666667,1.023809524,20,False,39.861111,0.5
+101,202401,53,1,1.666666667,1.023809524,40,False,51.527778,0.5
+101,202401,12,0,1.666666667,1.023809524,10,False,,
+102,202401,50,1,2.5,1.023809524,60,False,48.611111,0.5
+102,202401,40,1,2.5,1.023809524,50,False,38.888889,0.5
+102,202401,45,0,2.5,1.023809524,50,True,,
+102,202401,70,0,2.5,1.023809524,60,True,,
+102,202401,86,0,2.5,1.023809524,90,False,,
+104,202401,20,0,,0.004,90,False,,
+104,202401,30,0,,0.004,90,True,,