Skip to content

Commit

Permalink
Merge pull request #65 from ONSdigital/419-test-outliering-with-anon-…
Browse files Browse the repository at this point in the history
…data

419-test-outliering-with-anon-data
  • Loading branch information
AntonZogk authored Aug 2, 2024
2 parents 0559c73 + cee161d commit 5d69cfe
Show file tree
Hide file tree
Showing 17 changed files with 185 additions and 192 deletions.
39 changes: 15 additions & 24 deletions mbs_results/calculate_predicted_unit_value.py
100755 → 100644
Original file line number Diff line number Diff line change
@@ -1,29 +1,16 @@
import pandas as pd
import numpy as np


def calculate_predicted_unit_value(
    df: pd.DataFrame,
    aux: str,
    sampled: str,
    a_weight: str,
    target_variable: str,
    nw_ag_flag: str,
) -> pd.DataFrame:
    """
    Calculate predicted unit value

    The predicted value of a row is its auxiliary value multiplied by the
    ratio ``sum(a_weight * target) / sum(a_weight * aux)``, where both sums
    run only over rows that are sampled and are not flagged as
    non-winsorisable. Rows that are not sampled, or that are flagged, get
    NaN as their predicted value.

    Parameters
    ----------
    df : pd.Dataframe
        Original dataframe. The new column is added to this frame in place.
    aux : str
        Column name containing auxiliary variable (x).
    sampled : str
        Column name indicating whether the row was sampled
        (1 means sampled, 0 means not sampled).
    a_weight : str
        Column name containing the a weight.
    target_variable : str
        Column name of the target variable.
    nw_ag_flag: str
        column name indicating whether it can't be winsorised-
        boolean (True means it can't be winsorised, False means it can).

    Returns
    -------
    df : pd.DataFrame
        A pandas DataFrame with a new column containing the predicted unit value.
    """
    # Element-wise comparison. `df[nw_ag_flag] is True` would test the identity
    # of the Series object itself, which is always False, so the flag would be
    # silently ignored. `.eq(True)` also accepts legacy 1/0 flags and treats
    # missing flags as "can be winsorised".
    cannot_winsorise = df[nw_ag_flag].eq(True)

    winsorised = (df[sampled] == 1) & ~cannot_winsorise
    filtered_df = df.loc[winsorised]

    sum_weighted_target_values = (
        filtered_df[a_weight] * filtered_df[target_variable]
    ).sum()
    sum_weighted_auxiliary_values = (filtered_df[a_weight] * filtered_df[aux]).sum()

    # One ratio shared by every row; vectorised instead of a per-row apply.
    ratio = sum_weighted_target_values / sum_weighted_auxiliary_values
    df["predicted_unit_value"] = df[aux] * ratio

    non_winsorised = (df[sampled] == 0) | cannot_winsorise
    df["predicted_unit_value"] = df["predicted_unit_value"].mask(non_winsorised, np.nan)

    return df
53 changes: 25 additions & 28 deletions mbs_results/calculate_ratio_estimation.py
Original file line number Diff line number Diff line change
@@ -1,32 +1,24 @@
import pandas as pd
import numpy as np


def calculate_ratio_estimation(
    df: pd.DataFrame,
    aux: str,
    sampled: str,
    a_weight: str,
    g_weight: str,
    target_variable: str,
    predicted_unit_value: str,
    l_values: str,
    nw_ag_flag: str,
) -> pd.DataFrame:
    """
    Calculate ratio estimation threshold

    The threshold of a row is
    ``predicted_unit_value + l_values / (a_weight * g_weight - 1)``.
    Rows that are not sampled, or that are flagged as non-winsorisable,
    get NaN as their threshold.

    Parameters
    ----------
    df : pd.Dataframe
        Original dataframe. The new column is added to this frame in place.
    aux : str
        Column name containing auxiliary variable (x).
        Not used by the calculation; kept for interface compatibility.
    sampled : str
        Column name indicating whether the row was sampled
        (1 means sampled, 0 means not sampled).
    a_weight : str
        Column name containing the a weight.
    g_weight : str
        Column name containing the g weight.
    target_variable : str
        Column name of the target variable.
        Not used by the calculation; kept for interface compatibility.
    predicted_unit_value : str
        column name containing the predicted unit value
    l_values:str
        column containing the l values provided by methodology
    nw_ag_flag: str
        column name indicating whether it can't be winsorised-
        boolean (True means it can't be winsorised, False means it can).

    Returns
    -------
    df : pd.DataFrame
        A pandas DataFrame with a new column containing the ratio estimation.
    """
    # Kept as a local Series so no temporary column is written to the frame.
    combined_weight = df[a_weight] * df[g_weight]

    # NOTE: the output column name keeps its historical spelling ("treshold")
    # because downstream code refers to it by that exact name.
    threshold = df[predicted_unit_value] + (df[l_values] / (combined_weight - 1))

    # Element-wise comparison. `df[nw_ag_flag] is True` would test the identity
    # of the Series object itself, which is always False, so flagged rows would
    # never be masked.
    non_winsorised = (df[sampled] == 0) | df[nw_ag_flag].eq(True)
    df["ratio_estimation_treshold"] = threshold.mask(non_winsorised, np.nan)

    return df
61 changes: 30 additions & 31 deletions mbs_results/calculate_winsorised_weight.py
Original file line number Diff line number Diff line change
@@ -1,33 +1,32 @@
import numpy as np
import pandas as pd


def calculate_winsorised_weight(
    df: pd.DataFrame,
    strata: str,
    period: str,
    aux: str,
    sampled: str,
    a_weight: str,
    g_weight: str,
    target_variable: str,
    predicted_unit_value: str,
    l_values: str,
    ratio_estimation_treshold: str,
    nw_ag_flag: str,
) -> pd.DataFrame:
    """
    Calculate winsorised weight

    With ``w = a_weight * g_weight`` and ``t`` the ratio estimation
    threshold, a target value ``y`` that exceeds ``t`` is replaced by
    ``y / w + (t - t / w)``; values at or below ``t`` are kept. The outlier
    weight is the replaced value divided by the original value. Rows that are
    not sampled, or that are flagged as non-winsorisable, get NaN in both
    output columns.

    Parameters
    ----------
    df : pd.Dataframe
        Original dataframe. The new columns are added to this frame in place.
    strata : str
        Column name containing strata information (sic).
        Not used by the calculation; kept for interface compatibility.
    period : str
        Column name containing time period.
        Not used by the calculation; kept for interface compatibility.
    aux : str
        Column name containing auxiliary variable (x).
        Not used by the calculation; kept for interface compatibility.
    sampled : str
        Column name indicating whether the row was sampled
        (1 means sampled, 0 means not sampled).
    a_weight : str
        Column name containing the a weight.
    g_weight : str
        column name containing the g weight.
    target_variable : str
        Column name of the target variable.
    predicted_unit_value: str
        column name containing the predicted unit value.
        Not used by the calculation; kept for interface compatibility.
    l_values: str
        column name containing the l values as provided by methodology.
        Not used by the calculation; kept for interface compatibility.
    ratio_estimation_treshold: str
        column name containing the previously calculated ratio estimation threshold.
    nw_ag_flag: str
        column name indicating whether it can't be winsorised-
        boolean (True means it can't be winsorised, False means it can).

    Returns
    -------
    df : pd.DataFrame
        A pandas DataFrame with new columns ``new_target_variable`` and
        ``outlier_weight`` containing the winsorised values and weights.
    """
    # Intermediate results stay in local Series so that no temporary columns
    # are written to the frame.
    combined_weight = df[a_weight] * df[g_weight]
    threshold = df[ratio_estimation_treshold]
    target = df[target_variable]

    pulled_in_target = (target / combined_weight) + (
        threshold - (threshold / combined_weight)
    )

    # Only values above the threshold are pulled in; the rest are unchanged.
    within_threshold = target <= threshold
    new_target_variable = pd.Series(
        np.where(within_threshold, target, pulled_in_target), index=df.index
    )
    outlier_weight = new_target_variable / target

    # Element-wise comparison. `df[nw_ag_flag] is True` would test the identity
    # of the Series object itself, which is always False, so flagged rows would
    # never be masked.
    non_winsorised = (df[sampled] == 0) | df[nw_ag_flag].eq(True)

    df["new_target_variable"] = new_target_variable.mask(non_winsorised, np.nan)
    df["outlier_weight"] = outlier_weight.mask(non_winsorised, np.nan)

    return df
2 changes: 1 addition & 1 deletion mbs_results/flag_for_winsorisation.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def winsorisation_flag(df, a_weight, g_weight):

df["flag_calculation"] = df[a_weight] * df[g_weight]

df["nw_ag_flag"] = df["flag_calculation"].apply(lambda x: 1 if x <= 1 else 0)
df["nw_ag_flag"] = df["flag_calculation"].apply(lambda x: True if x <= 1 else 0)

df = df.drop("flag_calculation", axis=1)

Expand Down
30 changes: 15 additions & 15 deletions tests/data/winsorisation/flag_data.csv
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
a_weight,g_weight,nw_ag_flag
1.666666667,1.023809524,0
1.666666667,1.023809524,0
1.666666667,1.023809524,0
1.666666667,1.023809524,0
1.666666667,1.023809524,0
2.5,1.023809524,0
2.5,1.023809524,0
2.5,1.023809524,0
2.5,1.023809524,0
2.5,1.023809524,0
0.32,0.004,1
0.32,0.004,1
,0.004,0
,0.004,0
0.5,2.0,1
1.666666667,1.023809524,False
1.666666667,1.023809524,False
1.666666667,1.023809524,False
1.666666667,1.023809524,False
1.666666667,1.023809524,False
2.5,1.023809524,False
2.5,1.023809524,False
2.5,1.023809524,False
2.5,1.023809524,False
2.5,1.023809524,False
0.32,0.004,True
0.32,0.004,True
,0.004,False
,0.004,False
0.5,2.0,True
15 changes: 0 additions & 15 deletions tests/data/winsorisation/predicted_unit_data.csv

This file was deleted.

15 changes: 0 additions & 15 deletions tests/data/winsorisation/predicted_unit_value.csv

This file was deleted.

15 changes: 15 additions & 0 deletions tests/data/winsorisation/predicted_unit_value_data.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
strata,period,aux,sampled,a_weight,g_weight,target_variable,nw_ag_flag
101,202401,10,0,1.666666667,1.023809524,12,False
101,202401,23,1,1.666666667,1.023809524,20,False
101,202401,41,1,1.666666667,1.023809524,20,False
101,202401,53,1,1.666666667,1.023809524,40,False
101,202401,12,0,1.666666667,1.023809524,10,False
102,202401,50,1,2.5,1.023809524,60,False
102,202401,40,1,2.5,1.023809524,50,False
102,202401,45,0,2.5,1.023809524,50,False
102,202401,70,0,2.5,1.023809524,60,False
102,202401,86,0,2.5,1.023809524,90,True
103,202401,20,0,0.32,0.004,90,True
103,202401,30,0,0.32,0.004,90,False
104,202401,20,0,,0.004,90,False
104,202401,30,0,,0.004,90,True
19 changes: 14 additions & 5 deletions tests/data/winsorisation/predicted_unit_value_output.csv
Original file line number Diff line number Diff line change
@@ -1,6 +1,15 @@
strata,period,aux,sampled,a_weight,g_weight,target_variable,nw_ag_flag,predicted_unit_value
101,202312,23,1,1.666666667,1.023809524,20,0,22.361111
101,202401,41,1,1.666666667,1.023809524,20,0,39.861111
101,202401,53,1,1.666666667,1.023809524,40,0,51.527778
102,202401,50,1,2.5,1.023809524,60,0,48.611111
102,202401,40,1,2.5,1.023809524,50,0,38.888889
101,202401,10,0,1.666666667,1.023809524,12,False,
101,202312,23,1,1.666666667,1.023809524,20,False,22.361111
101,202401,41,1,1.666666667,1.023809524,20,False,39.861111
101,202401,53,1,1.666666667,1.023809524,40,False,51.527778
101,202401,12,0,1.666666667,1.023809524,10,False,
102,202401,50,1,2.5,1.023809524,60,False,48.611111
102,202401,40,1,2.5,1.023809524,50,False,38.888889
102,202401,45,0,2.5,1.023809524,50,False,
102,202401,70,0,2.5,1.023809524,60,False,
102,202401,86,0,2.5,1.023809524,90,True,
103,202401,20,0,0.32,0.004,90,True,
103,202401,30,0,0.32,0.004,90,False,
104,202401,20,0,,0.004,90,False,
104,202401,30,0,,0.004,90,True,
26 changes: 13 additions & 13 deletions tests/data/winsorisation/ratio_estimation_data.csv
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
strata,period,aux,sampled,a_weight,g_weight,target_variable,predicted_unit_value,l_values
101,202401,10,0,1.666666667,1.023809524,12
101,202312,23,1,1.666666667,1.023809524,20,22.361111,0.5
101,202401,41,1,1.666666667,1.023809524,20,39.861111,0.5
101,202401,53,1,1.666666667,1.023809524,40,51.527778,0.5
101,202401,12,0,1.666666667,1.023809524,10
102,202401,50,1,2.5,1.023809524,60,48.611111,0.5
102,202401,40,1,2.5,1.023809524,50,38.888889,0.5
102,202401,45,0,2.5,1.023809524,50
102,202401,70,0,2.5,1.023809524,60
102,202401,86,0,2.5,1.023809524,90
104,202401,20,0,,0.004,90
104,202401,30,0,,0.004,90
strata,period,aux,sampled,a_weight,g_weight,target_variable,nw_ag_flag,predicted_unit_value,l_value
101,202401,10,0,1.666666667,1.023809524,12,False,,
101,202312,23,1,1.666666667,1.023809524,20,False,22.361111,0.5
101,202401,41,1,1.666666667,1.023809524,20,False,39.861111,0.5
101,202401,53,1,1.666666667,1.023809524,40,False,51.527778,0.5
101,202401,12,0,1.666666667,1.023809524,10,False,,
102,202401,50,1,2.5,1.023809524,60,False,48.611111,0.5
102,202401,40,1,2.5,1.023809524,50,False,38.888889,0.5
102,202401,45,0,2.5,1.023809524,50,True,,
102,202401,70,0,2.5,1.023809524,60,True,,
102,202401,86,0,2.5,1.023809524,90,False,,
104,202401,20,0,,0.004,90,False,,
104,202401,30,0,,0.004,90,True,,
Loading

0 comments on commit 5d69cfe

Please sign in to comment.