From 6e16dd97f63680de418d951a5739c8f77a6aa2a3 Mon Sep 17 00:00:00 2001
From: giaccg <Giulia.Giaccaglia@ons.gov.uk>
Date: Thu, 1 Aug 2024 15:06:40 +0100
Subject: [PATCH 1/2] fixed winsorisation

---
 mbs_results/calculate_predicted_unit_value.py | 37 +++++--------
 mbs_results/calculate_ratio_estimation.py     | 49 ++++++++---------
 mbs_results/calculate_winsorised_weight.py    | 52 ++++++++++---------
 mbs_results/flag_for_winsorisation.py         |  2 +-
 tests/data/winsorisation/flag_data.csv        | 30 +++++------
 .../winsorisation/predicted_unit_data.csv     | 15 ------
 .../winsorisation/predicted_unit_value.csv    | 15 ------
 .../predicted_unit_value_data.csv             | 15 ++++++
 .../predicted_unit_value_output.csv           | 19 +++++--
 .../winsorisation/ratio_estimation_data.csv   | 26 +++++-----
 .../ratio_estimation_data_output.csv          | 19 ++++---
 .../winsorisation/winsorised_weight_data.csv  | 19 ++++---
 .../winsorised_weight_data_output.csv         | 19 ++++---
 tests/test_calculate_predicted_unit_value.py  | 15 ++----
 tests/test_calculate_ratio_estimation.py      | 16 +++---
 tests/test_calculate_winsorised_weight.py     | 12 ++---
 tests/test_flag_for_winsorisation.py          |  2 +-
 17 files changed, 181 insertions(+), 181 deletions(-)
 mode change 100755 => 100644 mbs_results/calculate_predicted_unit_value.py
 delete mode 100755 tests/data/winsorisation/predicted_unit_data.csv
 delete mode 100755 tests/data/winsorisation/predicted_unit_value.csv
 create mode 100755 tests/data/winsorisation/predicted_unit_value_data.csv

diff --git a/mbs_results/calculate_predicted_unit_value.py b/mbs_results/calculate_predicted_unit_value.py
old mode 100755
new mode 100644
index 9bf11845..d5ec1c88
--- a/mbs_results/calculate_predicted_unit_value.py
+++ b/mbs_results/calculate_predicted_unit_value.py
@@ -1,29 +1,16 @@
-import pandas as pd
+import numpy as np
 
 
 def calculate_predicted_unit_value(
-    df: pd.DataFrame,
-    period: str,
-    strata: str,
-    aux: str,
-    sampled: str,
-    a_weight: str,
-    target_variable: str,
-    nw_ag_flag: str,
-) -> pd.DataFrame:
+    df, aux, sampled, a_weight, target_variable, nw_ag_flag
+):
     """
-    Calculate link between target_variable and predictive_variable by strata,
-    a match_col must be supplied which indicates if target_variable
-    and predictive_variable can be linked.
+    Calculate predicted unit value
 
     Parameters
     ----------
     df : pd.Dataframe
         Original dataframe.
-    period : str
-        Column name containing time period.
-    strata : str
-        Column name containing strata information (sic).
     aux : str
         Column name containing auxiliary variable (x).
     sampled : str
@@ -41,15 +28,19 @@ def calculate_predicted_unit_value(
         A pandas DataFrame with a new column containing the predicted unit value.
     """
 
-    df = df.loc[(df["sampled"] == 1) & (df["nw_ag_flag"] == 0)]
-    df = df.reset_index(drop=True)
-    # check if reset index creates problems down the line
+    winsorised = (df[sampled] == 1) & ~df[nw_ag_flag].eq(True)  # row-wise test
+    filtered_df = df.loc[winsorised]
 
-    sum_weighted_target_values = (df["a_weight"] * df["target_variable"]).sum()
-    sum_weighted_auxiliary_values = (df["a_weight"] * df["aux"]).sum()
+    sum_weighted_target_values = (
+        filtered_df[a_weight] * filtered_df[target_variable]
+    ).sum()
+    sum_weighted_auxiliary_values = (filtered_df[a_weight] * filtered_df[aux]).sum()
 
-    df["predicted_unit_value"] = df["aux"].apply(
+    df["predicted_unit_value"] = df[aux].apply(
         lambda x: x * (sum_weighted_target_values / sum_weighted_auxiliary_values)
     )
 
+    non_winsorised = (df[sampled] == 0) | df[nw_ag_flag].eq(True)  # row-wise test
+    df["predicted_unit_value"] = df["predicted_unit_value"].mask(non_winsorised, np.nan)
+
     return df
diff --git a/mbs_results/calculate_ratio_estimation.py b/mbs_results/calculate_ratio_estimation.py
index 6e05f3e4..e0105420 100755
--- a/mbs_results/calculate_ratio_estimation.py
+++ b/mbs_results/calculate_ratio_estimation.py
@@ -1,24 +1,20 @@
-import pandas as pd
+import numpy as np
 
 
 def calculate_ratio_estimation(
-    df: pd.DataFrame,
-    strata: str,
-    period: str,
-    aux: str,
-    sampled: str,
-    a_weight: str,
-    g_weight: str,
-    target_variable: str,
-    predicted_unit_value: str,
-    l_values: str,
-) -> pd.DataFrame:
+    df,
+    aux,
+    sampled,
+    a_weight,
+    g_weight,
+    target_variable,
+    predicted_unit_value,
+    l_values,
+    nw_ag_flag,
+):
 
     """
-    Calculate link between target_variable and predictive_variable by strata,
-    a match_col must be supplied which indicates if target_variable
-    and predictive_variable can be linked.
-
+    Calculate ratio estimation threshold
     Parameters
     ----------
     df : pd.Dataframe
@@ -41,20 +37,25 @@ def calculate_ratio_estimation(
         column name containing the predicted unit value
     l_values:str
         column containing the l values provided by methodology
+    nw_ag_flag: str
+        column name indicating whether it can't be winsorised-
+        boolean (1 means it can't be winsorised, 0 means it can).
+
     Returns
     -------
     df : pd.DataFrame
-        A pandas DataFrame with a new column containing the predicted unit value.
+        A pandas DataFrame with a new column containing the ratio estimation.
     """
 
-    df = df[df["predicted_unit_value"].notna()]
-    df = df.reset_index(drop=True)
-    # check if reset index creates problems down the line
-
-    df["flag_calculation"] = df["a_weight"] * df["g_weight"]
-    df["ratio_estimation_treshold"] = df["predicted_unit_value"] + (
-        df["l_values"] / (df["flag_calculation"] - 1)
+    df["flag_calculation"] = df[a_weight] * df[g_weight]
+    df["ratio_estimation_treshold"] = (df[predicted_unit_value]) + (
+        df[l_values] / (df["flag_calculation"] - 1)
     )
     df = df.drop("flag_calculation", axis=1)
 
+    non_winsorised = (df[sampled] == 0) | df[nw_ag_flag].eq(True)  # row-wise test
+    df["ratio_estimation_treshold"] = df["ratio_estimation_treshold"].mask(
+        non_winsorised, np.nan
+    )
+
     return df
diff --git a/mbs_results/calculate_winsorised_weight.py b/mbs_results/calculate_winsorised_weight.py
index d78689b5..b9980bac 100755
--- a/mbs_results/calculate_winsorised_weight.py
+++ b/mbs_results/calculate_winsorised_weight.py
@@ -1,21 +1,20 @@
 import numpy as np
-import pandas as pd
 
 
 def calculate_winsorised_weight(
-    df: pd.DataFrame,
-    strata: str,
-    period: str,
-    aux: str,
-    sampled: str,
-    a_weight: str,
-    g_weight: str,
-    target_variable: str,
-    nw_ag_flag: str,
-    predicted_unit_value: str,
-    l_values: str,
-    ratio_estimation_treshold: str,
-) -> pd.DataFrame:
+    df,
+    strata,
+    period,
+    aux,
+    sampled,
+    a_weight,
+    g_weight,
+    target_variable,
+    predicted_unit_value,
+    l_values,
+    ratio_estimation_treshold,
+    nw_ag_flag,
+):
 
     """
     Calculate winsorised weight
@@ -47,29 +46,32 @@ def calculate_winsorised_weight(
         column name containing the l values as provided by methodology.
     ratio_estimation_treshold: str
         column name containing the previously calculated ratio estimation threshold.
+    nw_ag_flag: str
+        column name indicating whether it can't be winsorised-
+        boolean (1 means it can't be winsorised, 0 means it can).
 
 
     Returns
     -------
     df : pd.DataFrame
-        A pandas DataFrame with a new column containing the predicted unit value.
+        A pandas DataFrame with a new column containing the winsorised weights.
     """
 
-    df = df[df["predicted_unit_value"].notna()]
-    df = df.reset_index(drop=True)
-    # check if reset index creates problems down the line
+    df["w"] = df[a_weight] * df[g_weight]
 
-    df["w"] = df["a_weight"] * df["g_weight"]
-
-    df["new_target"] = df["target_variable"] / df["w"] + (
-        df["ratio_estimation_treshold"] - (df["ratio_estimation_treshold"] / df["w"])
+    df["new_target"] = (df[target_variable] / df["w"]) + (
+        df[ratio_estimation_treshold] - (df[ratio_estimation_treshold] / df["w"])
     )
 
-    mask = df["target_variable"] <= df["ratio_estimation_treshold"]
-    df["new_target_variable"] = np.where(mask, df["target_variable"], df["new_target"])
+    mask = df[target_variable] <= df[ratio_estimation_treshold]
+    df["new_target_variable"] = np.where(mask, df[target_variable], df["new_target"])
 
-    df["outlier_weight"] = df["new_target_variable"] / df["target_variable"]
+    df["outlier_weight"] = df["new_target_variable"] / df[target_variable]
 
     df = df.drop(["w", "new_target"], axis=1)
 
+    non_winsorised = (df[sampled] == 0) | df[nw_ag_flag].eq(True)  # row-wise test
+    df["outlier_weight"] = df["outlier_weight"].mask(non_winsorised, np.nan)
+    df["new_target_variable"] = df["new_target_variable"].mask(non_winsorised, np.nan)
+
     return df
diff --git a/mbs_results/flag_for_winsorisation.py b/mbs_results/flag_for_winsorisation.py
index e2b34c19..dc88634a 100644
--- a/mbs_results/flag_for_winsorisation.py
+++ b/mbs_results/flag_for_winsorisation.py
@@ -26,7 +26,7 @@ def winsorisation_flag(df, a_weight, g_weight):
 
     df["flag_calculation"] = df[a_weight] * df[g_weight]
 
-    df["nw_ag_flag"] = df["flag_calculation"].apply(lambda x: 1 if x <= 1 else 0)
+    df["nw_ag_flag"] = df["flag_calculation"] <= 1  # bool column; NaN -> False
 
     df = df.drop("flag_calculation", axis=1)
 
diff --git a/tests/data/winsorisation/flag_data.csv b/tests/data/winsorisation/flag_data.csv
index ce500442..4808924f 100755
--- a/tests/data/winsorisation/flag_data.csv
+++ b/tests/data/winsorisation/flag_data.csv
@@ -1,16 +1,16 @@
 a_weight,g_weight,nw_ag_flag
-1.666666667,1.023809524,0
-1.666666667,1.023809524,0
-1.666666667,1.023809524,0
-1.666666667,1.023809524,0
-1.666666667,1.023809524,0
-2.5,1.023809524,0
-2.5,1.023809524,0
-2.5,1.023809524,0
-2.5,1.023809524,0
-2.5,1.023809524,0
-0.32,0.004,1
-0.32,0.004,1
-,0.004,0
-,0.004,0
-0.5,2.0,1
+1.666666667,1.023809524,False
+1.666666667,1.023809524,False
+1.666666667,1.023809524,False
+1.666666667,1.023809524,False
+1.666666667,1.023809524,False
+2.5,1.023809524,False
+2.5,1.023809524,False
+2.5,1.023809524,False
+2.5,1.023809524,False
+2.5,1.023809524,False
+0.32,0.004,True
+0.32,0.004,True
+,0.004,False
+,0.004,False
+0.5,2.0,True
diff --git a/tests/data/winsorisation/predicted_unit_data.csv b/tests/data/winsorisation/predicted_unit_data.csv
deleted file mode 100755
index 7eeae577..00000000
--- a/tests/data/winsorisation/predicted_unit_data.csv
+++ /dev/null
@@ -1,15 +0,0 @@
-strata,period,aux,sampled,a_weight,g_weight,target_variable,nw_ag_flag
-101,202401,10,0,1.666666667,1.023809524,12,0
-101,202401,23,1,1.666666667,1.023809524,20,0
-101,202401,41,1,1.666666667,1.023809524,20,0
-101,202401,53,1,1.666666667,1.023809524,40,0
-101,202401,12,0,1.666666667,1.023809524,10,0
-102,202401,50,1,2.5,1.023809524,60,0
-102,202401,40,1,2.5,1.023809524,50,0
-102,202401,45,0,2.5,1.023809524,50,0
-102,202401,70,0,2.5,1.023809524,60,0
-102,202401,86,0,2.5,1.023809524,90,0
-103,202401,20,0,0.32,0.004,90,1
-103,202401,30,0,0.32,0.004,90,1
-104,202401,20,0,,0.004,90,0
-104,202401,30,0,,0.004,90,0
diff --git a/tests/data/winsorisation/predicted_unit_value.csv b/tests/data/winsorisation/predicted_unit_value.csv
deleted file mode 100755
index a9036212..00000000
--- a/tests/data/winsorisation/predicted_unit_value.csv
+++ /dev/null
@@ -1,15 +0,0 @@
-strata,period,aux,sampled,a_weight,g_weight,target_variable,nw_ag_flag,predicted_unit_value
-101,202401,10,0,1.666666667,1.023809524,12,0
-101,202312,23,1,1.666666667,1.023809524,20,0,22.361111
-101,202401,41,1,1.666666667,1.023809524,20,0,39.861111
-101,202401,53,1,1.666666667,1.023809524,40,0,51.527778
-101,202401,12,0,1.666666667,1.023809524,10,0
-102,202401,50,1,2.5,1.023809524,60,0,48.611111
-102,202401,40,1,2.5,1.023809524,50,0,38.888889
-102,202401,45,0,2.5,1.023809524,50,0
-102,202401,70,0,2.5,1.023809524,60,0
-102,202401,86,0,2.5,1.023809524,90,0
-103,202401,20,0,0.32,0.004,90,1
-103,202401,30,0,0.32,0.004,90,1
-104,202401,20,0,,0.004,90,0
-104,202401,30,0,,0.004,90,0
diff --git a/tests/data/winsorisation/predicted_unit_value_data.csv b/tests/data/winsorisation/predicted_unit_value_data.csv
new file mode 100755
index 00000000..bb513185
--- /dev/null
+++ b/tests/data/winsorisation/predicted_unit_value_data.csv
@@ -0,0 +1,15 @@
+strata,period,aux,sampled,a_weight,g_weight,target_variable,nw_ag_flag
+101,202401,10,0,1.666666667,1.023809524,12,False
+101,202401,23,1,1.666666667,1.023809524,20,False
+101,202401,41,1,1.666666667,1.023809524,20,False
+101,202401,53,1,1.666666667,1.023809524,40,False
+101,202401,12,0,1.666666667,1.023809524,10,False
+102,202401,50,1,2.5,1.023809524,60,False
+102,202401,40,1,2.5,1.023809524,50,False
+102,202401,45,0,2.5,1.023809524,50,False
+102,202401,70,0,2.5,1.023809524,60,False
+102,202401,86,0,2.5,1.023809524,90,True
+103,202401,20,0,0.32,0.004,90,True
+103,202401,30,0,0.32,0.004,90,False
+104,202401,20,0,,0.004,90,False
+104,202401,30,0,,0.004,90,True
diff --git a/tests/data/winsorisation/predicted_unit_value_output.csv b/tests/data/winsorisation/predicted_unit_value_output.csv
index b4236126..c26a3373 100755
--- a/tests/data/winsorisation/predicted_unit_value_output.csv
+++ b/tests/data/winsorisation/predicted_unit_value_output.csv
@@ -1,6 +1,15 @@
 strata,period,aux,sampled,a_weight,g_weight,target_variable,nw_ag_flag,predicted_unit_value
-101,202312,23,1,1.666666667,1.023809524,20,0,22.361111
-101,202401,41,1,1.666666667,1.023809524,20,0,39.861111
-101,202401,53,1,1.666666667,1.023809524,40,0,51.527778
-102,202401,50,1,2.5,1.023809524,60,0,48.611111
-102,202401,40,1,2.5,1.023809524,50,0,38.888889
+101,202401,10,0,1.666666667,1.023809524,12,False,
+101,202312,23,1,1.666666667,1.023809524,20,False,22.361111
+101,202401,41,1,1.666666667,1.023809524,20,False,39.861111
+101,202401,53,1,1.666666667,1.023809524,40,False,51.527778
+101,202401,12,0,1.666666667,1.023809524,10,False,
+102,202401,50,1,2.5,1.023809524,60,False,48.611111
+102,202401,40,1,2.5,1.023809524,50,False,38.888889
+102,202401,45,0,2.5,1.023809524,50,False,
+102,202401,70,0,2.5,1.023809524,60,False,
+102,202401,86,0,2.5,1.023809524,90,True,
+103,202401,20,0,0.32,0.004,90,True,
+103,202401,30,0,0.32,0.004,90,False,
+104,202401,20,0,,0.004,90,False,
+104,202401,30,0,,0.004,90,True,
diff --git a/tests/data/winsorisation/ratio_estimation_data.csv b/tests/data/winsorisation/ratio_estimation_data.csv
index 39a5f069..ccc8485e 100755
--- a/tests/data/winsorisation/ratio_estimation_data.csv
+++ b/tests/data/winsorisation/ratio_estimation_data.csv
@@ -1,13 +1,13 @@
-strata,period,aux,sampled,a_weight,g_weight,target_variable,predicted_unit_value,l_values
-101,202401,10,0,1.666666667,1.023809524,12
-101,202312,23,1,1.666666667,1.023809524,20,22.361111,0.5
-101,202401,41,1,1.666666667,1.023809524,20,39.861111,0.5
-101,202401,53,1,1.666666667,1.023809524,40,51.527778,0.5
-101,202401,12,0,1.666666667,1.023809524,10
-102,202401,50,1,2.5,1.023809524,60,48.611111,0.5
-102,202401,40,1,2.5,1.023809524,50,38.888889,0.5
-102,202401,45,0,2.5,1.023809524,50
-102,202401,70,0,2.5,1.023809524,60
-102,202401,86,0,2.5,1.023809524,90
-104,202401,20,0,,0.004,90
-104,202401,30,0,,0.004,90
+strata,period,aux,sampled,a_weight,g_weight,target_variable,nw_ag_flag,predicted_unit_value,l_value
+101,202401,10,0,1.666666667,1.023809524,12,False,,
+101,202312,23,1,1.666666667,1.023809524,20,False,22.361111,0.5
+101,202401,41,1,1.666666667,1.023809524,20,False,39.861111,0.5
+101,202401,53,1,1.666666667,1.023809524,40,False,51.527778,0.5
+101,202401,12,0,1.666666667,1.023809524,10,False,,
+102,202401,50,1,2.5,1.023809524,60,False,48.611111,0.5
+102,202401,40,1,2.5,1.023809524,50,False,38.888889,0.5
+102,202401,45,0,2.5,1.023809524,50,True,,
+102,202401,70,0,2.5,1.023809524,60,True,,
+102,202401,86,0,2.5,1.023809524,90,False,,
+104,202401,20,0,,0.004,90,False,,
+104,202401,30,0,,0.004,90,True,,
diff --git a/tests/data/winsorisation/ratio_estimation_data_output.csv b/tests/data/winsorisation/ratio_estimation_data_output.csv
index ca0d5b54..9c4e4c78 100755
--- a/tests/data/winsorisation/ratio_estimation_data_output.csv
+++ b/tests/data/winsorisation/ratio_estimation_data_output.csv
@@ -1,6 +1,13 @@
-strata,period,aux,sampled,a_weight,g_weight,target_variable,predicted_unit_value,l_values,ratio_estimation_treshold
-101,202312,23,1,1.666666667,1.023809524,20,22.361111,0.5,23.068976
-101,202401,41,1,1.666666667,1.023809524,20,39.861111,0.5,40.568976
-101,202401,53,1,1.666666667,1.023809524,40,51.527778,0.5,52.235643
-102,202401,50,1,2.5,1.023809524,60,48.611111,0.5,48.931722
-102,202401,40,1,2.5,1.023809524,50,38.888889,0.5,39.209500
+strata,period,aux,sampled,a_weight,g_weight,target_variable,nw_ag_flag,predicted_unit_value,l_value,ratio_estimation_treshold
+101,202401,10,0,1.666666667,1.023809524,12,False,,,
+101,202312,23,1,1.666666667,1.023809524,20,False,22.361111,0.5,23.068976
+101,202401,41,1,1.666666667,1.023809524,20,False,39.861111,0.5,40.568976
+101,202401,53,1,1.666666667,1.023809524,40,False,51.527778,0.5,52.235643
+101,202401,12,0,1.666666667,1.023809524,10,False,,,
+102,202401,50,1,2.5,1.023809524,60,False,48.611111,0.5,48.931722
+102,202401,40,1,2.5,1.023809524,50,False,38.888889,0.5,39.209500
+102,202401,45,0,2.5,1.023809524,50,True,,,
+102,202401,70,0,2.5,1.023809524,60,True,,,
+102,202401,86,0,2.5,1.023809524,90,False,,,
+104,202401,20,0,,0.004,90,False,,,
+104,202401,30,0,,0.004,90,True,,,
diff --git a/tests/data/winsorisation/winsorised_weight_data.csv b/tests/data/winsorisation/winsorised_weight_data.csv
index 84706398..9c4e4c78 100755
--- a/tests/data/winsorisation/winsorised_weight_data.csv
+++ b/tests/data/winsorisation/winsorised_weight_data.csv
@@ -1,6 +1,13 @@
-strata,period,aux,sampled,a_weight,g_weight,target_variable,nw_ag_flag,predicted_unit_value,l_values,ratio_estimation_treshold
-101,202312,23,1,1.666666667,1.023809524,20,0,22.361111,0.5, 23.0689763
-101,202401,41,1,1.666666667,1.023809524,20,0,39.861111,0.5,40.5689763
-101,202401,53,1,1.666666667,1.023809524,40,0,51.527778,0.5,52.2356429
-102,202401,50,1,2.5,1.023809524,60,0,48.611111,0.5,48.9317218
-102,202401,40,1,2.5,1.023809524,50,0,38.888889,0.5,39.2094996
+strata,period,aux,sampled,a_weight,g_weight,target_variable,nw_ag_flag,predicted_unit_value,l_value,ratio_estimation_treshold
+101,202401,10,0,1.666666667,1.023809524,12,False,,,
+101,202312,23,1,1.666666667,1.023809524,20,False,22.361111,0.5,23.068976
+101,202401,41,1,1.666666667,1.023809524,20,False,39.861111,0.5,40.568976
+101,202401,53,1,1.666666667,1.023809524,40,False,51.527778,0.5,52.235643
+101,202401,12,0,1.666666667,1.023809524,10,False,,,
+102,202401,50,1,2.5,1.023809524,60,False,48.611111,0.5,48.931722
+102,202401,40,1,2.5,1.023809524,50,False,38.888889,0.5,39.209500
+102,202401,45,0,2.5,1.023809524,50,True,,,
+102,202401,70,0,2.5,1.023809524,60,True,,,
+102,202401,86,0,2.5,1.023809524,90,False,,,
+104,202401,20,0,,0.004,90,False,,,
+104,202401,30,0,,0.004,90,True,,,
diff --git a/tests/data/winsorisation/winsorised_weight_data_output.csv b/tests/data/winsorisation/winsorised_weight_data_output.csv
index 06178971..39bd3a7c 100755
--- a/tests/data/winsorisation/winsorised_weight_data_output.csv
+++ b/tests/data/winsorisation/winsorised_weight_data_output.csv
@@ -1,6 +1,13 @@
-strata,period,aux,sampled,a_weight,g_weight,target_variable,nw_ag_flag,predicted_unit_value,l_values,ratio_estimation_treshold,new_target_variable,outlier_weight
-101,202312,23,1,1.666666667,1.023809524,20,0,22.361111,0.5, 23.0689763, 20.0000, 1.00000
-101,202401,41,1,1.666666667,1.023809524,20,0,39.861111,0.5,40.5689763, 20.00000, 1.00000
-101,202401,53,1,1.666666667,1.023809524,40,0,51.527778,0.5,52.2356429, 40.00000,1.00000
-102,202401,50,1,2.5,1.023809524,60,0,48.611111,0.5,48.9317218, 53.256072,0.887601
-102,202401,40,1,2.5,1.023809524,50,0,38.888889,0.5,39.2094996, 43.425323,0.868506
+strata,period,aux,sampled,a_weight,g_weight,target_variable,nw_ag_flag,predicted_unit_value,l_value,ratio_estimation_treshold,new_target_variable,outlier_weight
+101,202401,10,0,1.666666667,1.023809524,12,False,,,,,
+101,202312,23,1,1.666666667,1.023809524,20,False,22.361111,0.5,23.068976,20.0000, 1.00000
+101,202401,41,1,1.666666667,1.023809524,20,False,39.861111,0.5,40.568976,20.00000, 1.00000
+101,202401,53,1,1.666666667,1.023809524,40,False,51.527778,0.5,52.235643,40.00000,1.00000
+101,202401,12,0,1.666666667,1.023809524,10,False,,,,,
+102,202401,50,1,2.5,1.023809524,60,False,48.611111,0.5,48.931722,53.256072,0.887601
+102,202401,40,1,2.5,1.023809524,50,False,38.888889,0.5,39.209500,43.425323,0.868506
+102,202401,45,0,2.5,1.023809524,50,True,,,,,
+102,202401,70,0,2.5,1.023809524,60,True,,,,,
+102,202401,86,0,2.5,1.023809524,90,False,,,,,
+104,202401,20,0,,0.004,90,False,,,,,
+104,202401,30,0,,0.004,90,True,,,,,
diff --git a/tests/test_calculate_predicted_unit_value.py b/tests/test_calculate_predicted_unit_value.py
index fe814c80..2bdc493b 100755
--- a/tests/test_calculate_predicted_unit_value.py
+++ b/tests/test_calculate_predicted_unit_value.py
@@ -10,7 +10,7 @@
 @pytest.fixture(scope="class")
 def predicted_unit_value_test_data():
     return pd.read_csv(
-        Path("tests") / "data" / "winsorisation" / "predicted_unit_value.csv",
+        Path("tests") / "data" / "winsorisation" / "predicted_unit_value_data.csv",
         low_memory=False,
         usecols=lambda c: not c.startswith("Unnamed:"),
     )
@@ -27,12 +27,10 @@ def predicted_unit_value_test_output():
 
 class TestPredictedUnitValue:
     def test_calculate_predicted_unit_value(
-        self, predicted_unit_value_test_output, predicted_unit_value_test_data
+        self, predicted_unit_value_test_data, predicted_unit_value_test_output
     ):
         expected_output = predicted_unit_value_test_output[
             [
-                "period",
-                "strata",
                 "aux",
                 "sampled",
                 "a_weight",
@@ -43,25 +41,20 @@ def test_calculate_predicted_unit_value(
         ]
         input_data = predicted_unit_value_test_data[
             [
-                "period",
-                "strata",
                 "aux",
                 "sampled",
                 "a_weight",
                 "target_variable",
                 "nw_ag_flag",
-                "predicted_unit_value",
             ]
         ]
-        input_data = input_data.drop(columns=["predicted_unit_value"])
+
         actual_output = calculate_predicted_unit_value(
             input_data,
-            "period",
-            "strata",
             "aux",
             "sampled",
             "a_weight",
-            "target_variale",
+            "target_variable",
             "nw_ag_flag",
         )
 
diff --git a/tests/test_calculate_ratio_estimation.py b/tests/test_calculate_ratio_estimation.py
index e99c2a2d..c7087c29 100755
--- a/tests/test_calculate_ratio_estimation.py
+++ b/tests/test_calculate_ratio_estimation.py
@@ -31,43 +31,41 @@ def test_calculate_ratio_estimation(
     ):
         expected_output = ratio_estimation_test_output[
             [
-                "strata",
-                "period",
                 "aux",
                 "sampled",
                 "a_weight",
                 "g_weight",
                 "target_variable",
                 "predicted_unit_value",
-                "l_values",
+                "l_value",
+                "nw_ag_flag",
                 "ratio_estimation_treshold",
             ]
         ]
+
         input_data = ratio_estimation_test_data[
             [
-                "strata",
-                "period",
                 "aux",
                 "sampled",
                 "a_weight",
                 "g_weight",
                 "target_variable",
                 "predicted_unit_value",
-                "l_values",
+                "l_value",
+                "nw_ag_flag",
             ]
         ]
 
         actual_output = calculate_ratio_estimation(
             input_data,
-            "strata",
-            "period",
             "aux",
             "sampled",
             "a_weight",
             "g_weight",
             "target_variale",
             "predicted_unit_value",
-            "l_values",
+            "l_value",
+            "nw_ag_flag",
         )
 
         assert_frame_equal(actual_output, expected_output)
diff --git a/tests/test_calculate_winsorised_weight.py b/tests/test_calculate_winsorised_weight.py
index 546b1377..f9268d5c 100755
--- a/tests/test_calculate_winsorised_weight.py
+++ b/tests/test_calculate_winsorised_weight.py
@@ -38,10 +38,10 @@ def test_winsorised_weight(
                 "a_weight",
                 "g_weight",
                 "target_variable",
-                "nw_ag_flag",
                 "predicted_unit_value",
-                "l_values",
+                "l_value",
                 "ratio_estimation_treshold",
+                "nw_ag_flag",
                 "new_target_variable",
                 "outlier_weight",
             ]
@@ -55,10 +55,10 @@ def test_winsorised_weight(
                 "a_weight",
                 "g_weight",
                 "target_variable",
-                "nw_ag_flag",
                 "predicted_unit_value",
-                "l_values",
+                "l_value",
                 "ratio_estimation_treshold",
+                "nw_ag_flag",
             ]
         ]
 
@@ -71,10 +71,10 @@ def test_winsorised_weight(
             "a_weight",
             "g_weight",
             "target_variable",
-            "nw_ag_flag",
             "predicted_unit_value",
-            "l_values",
+            "l_value",
             "ratio_estimation_treshold",
+            "nw_ag_flag",
         )
 
         assert_frame_equal(actual_output, expected_output)
diff --git a/tests/test_flag_for_winsorisation.py b/tests/test_flag_for_winsorisation.py
index 9dff320b..849567b5 100755
--- a/tests/test_flag_for_winsorisation.py
+++ b/tests/test_flag_for_winsorisation.py
@@ -28,4 +28,4 @@ def test_winsorisation_flag(self, winsorisation_flag_test_data):
             df=df_input, a_weight="a_weight", g_weight="g_weight"
         )
 
-        assert_frame_equal(df_output, df_expected_output)
+        assert_frame_equal(df_output, df_expected_output, check_dtype=False)

From cee161d1ce7ca0e1b340f00c30645c368186df68 Mon Sep 17 00:00:00 2001
From: giaccg <Giulia.Giaccaglia@ons.gov.uk>
Date: Fri, 2 Aug 2024 11:39:34 +0100
Subject: [PATCH 2/2] fixed docs

---
 mbs_results/calculate_predicted_unit_value.py |  2 +-
 mbs_results/calculate_ratio_estimation.py     |  6 +-----
 mbs_results/calculate_winsorised_weight.py    | 11 ++++-------
 3 files changed, 6 insertions(+), 13 deletions(-)

diff --git a/mbs_results/calculate_predicted_unit_value.py b/mbs_results/calculate_predicted_unit_value.py
index d5ec1c88..6b927241 100644
--- a/mbs_results/calculate_predicted_unit_value.py
+++ b/mbs_results/calculate_predicted_unit_value.py
@@ -21,7 +21,7 @@ def calculate_predicted_unit_value(
         Column name of the predicted target variable.
     nw_ag_flag: str
         column name indicating whether it can't be winsorised-
-        boolean (1 means it can't be winsorised, 0 means it can).
+        boolean (True means it can't be winsorised, False means it can).
     Returns
     -------
     df : pd.DataFrame
diff --git a/mbs_results/calculate_ratio_estimation.py b/mbs_results/calculate_ratio_estimation.py
index e0105420..d0d6426e 100755
--- a/mbs_results/calculate_ratio_estimation.py
+++ b/mbs_results/calculate_ratio_estimation.py
@@ -19,10 +19,6 @@ def calculate_ratio_estimation(
     ----------
     df : pd.Dataframe
         Original dataframe.
-    period : str
-        Column name containing time period.
-    strata : str
-        Column name containing strata information (sic).
     aux : str
         Column name containing auxiliary variable (x).
     sampled : str
@@ -39,7 +35,7 @@ def calculate_ratio_estimation(
         column containing the l values provided by methodology
     nw_ag_flag: str
         column name indicating whether it can't be winsorised-
-        boolean (1 means it can't be winsorised, 0 means it can).
+        boolean (True means it can't be winsorised, False means it can).
 
     Returns
     -------
diff --git a/mbs_results/calculate_winsorised_weight.py b/mbs_results/calculate_winsorised_weight.py
index b9980bac..f3bebbec 100755
--- a/mbs_results/calculate_winsorised_weight.py
+++ b/mbs_results/calculate_winsorised_weight.py
@@ -22,11 +22,11 @@ def calculate_winsorised_weight(
     Parameters
     ----------
     df : pd.Dataframe
-        Original dataframe.
-    period : str
-        Column name containing time period.
+        Original dataframe
     strata : str
         Column name containing strata information (sic).
+    period : str
+        Column name containing time period.
     aux : str
         Column name containing auxiliary variable (x).
     sampled : str
@@ -37,9 +37,6 @@ def calculate_winsorised_weight(
         column name containing the g weight.
     target_variable : str
         Column name of the predicted target variable.
-    nw_ag_flag: str
-        column name indicating whether it can't be winsorised-
-        boolean (1 means it can't be winsorised, 0 means it can).
     predicted_unit_value: str
         column name containing the predicted unit value.
     l_values: str
@@ -48,7 +45,7 @@ def calculate_winsorised_weight(
         column name containing the previously calculated ratio estimation threshold.
     nw_ag_flag: str
         column name indicating whether it can't be winsorised-
-        boolean (1 means it can't be winsorised, 0 means it can).
+        boolean (True means it can't be winsorised, False means it can).
 
 
     Returns