From 6e16dd97f63680de418d951a5739c8f77a6aa2a3 Mon Sep 17 00:00:00 2001 From: giaccg Date: Thu, 1 Aug 2024 15:06:40 +0100 Subject: [PATCH 1/2] fixed winsorisation --- mbs_results/calculate_predicted_unit_value.py | 37 +++++-------- mbs_results/calculate_ratio_estimation.py | 49 ++++++++--------- mbs_results/calculate_winsorised_weight.py | 52 ++++++++++--------- mbs_results/flag_for_winsorisation.py | 2 +- tests/data/winsorisation/flag_data.csv | 30 +++++------ .../winsorisation/predicted_unit_data.csv | 15 ------ .../winsorisation/predicted_unit_value.csv | 15 ------ .../predicted_unit_value_data.csv | 15 ++++++ .../predicted_unit_value_output.csv | 19 +++++-- .../winsorisation/ratio_estimation_data.csv | 26 +++++----- .../ratio_estimation_data_output.csv | 19 ++++--- .../winsorisation/winsorised_weight_data.csv | 19 ++++--- .../winsorised_weight_data_output.csv | 19 ++++--- tests/test_calculate_predicted_unit_value.py | 15 ++---- tests/test_calculate_ratio_estimation.py | 16 +++--- tests/test_calculate_winsorised_weight.py | 12 ++--- tests/test_flag_for_winsorisation.py | 2 +- 17 files changed, 181 insertions(+), 181 deletions(-) mode change 100755 => 100644 mbs_results/calculate_predicted_unit_value.py delete mode 100755 tests/data/winsorisation/predicted_unit_data.csv delete mode 100755 tests/data/winsorisation/predicted_unit_value.csv create mode 100755 tests/data/winsorisation/predicted_unit_value_data.csv diff --git a/mbs_results/calculate_predicted_unit_value.py b/mbs_results/calculate_predicted_unit_value.py old mode 100755 new mode 100644 index 9bf11845..d5ec1c88 --- a/mbs_results/calculate_predicted_unit_value.py +++ b/mbs_results/calculate_predicted_unit_value.py @@ -1,29 +1,16 @@ -import pandas as pd +import numpy as np def calculate_predicted_unit_value( - df: pd.DataFrame, - period: str, - strata: str, - aux: str, - sampled: str, - a_weight: str, - target_variable: str, - nw_ag_flag: str, -) -> pd.DataFrame: + df, aux, sampled, a_weight, target_variable, nw_ag_flag +): """ - Calculate link between target_variable and predictive_variable by strata, - a match_col must be supplied which indicates if target_variable - and predictive_variable can be linked. + Calculate predicted unit value Parameters ---------- df : pd.Dataframe Original dataframe. - period : str - Column name containing time period. - strata : str - Column name containing strata information (sic). aux : str Column name containing auxiliary variable (x). sampled : str @@ -41,15 +28,19 @@ def calculate_predicted_unit_value( A pandas DataFrame with a new column containing the predicted unit value. """ - df = df.loc[(df["sampled"] == 1) & (df["nw_ag_flag"] == 0)] - df = df.reset_index(drop=True) - # check if reset index creates problems down the line + winsorised = (df[sampled] == 1) & (not df[nw_ag_flag] is True) + filtered_df = df.loc[winsorised] - sum_weighted_target_values = (df["a_weight"] * df["target_variable"]).sum() - sum_weighted_auxiliary_values = (df["a_weight"] * df["aux"]).sum() + sum_weighted_target_values = ( + filtered_df[a_weight] * filtered_df[target_variable] + ).sum() + sum_weighted_auxiliary_values = (filtered_df[a_weight] * filtered_df[aux]).sum() - df["predicted_unit_value"] = df["aux"].apply( + df["predicted_unit_value"] = df[aux].apply( lambda x: x * (sum_weighted_target_values / sum_weighted_auxiliary_values) ) + non_winsorised = (df[sampled] == 0) | (df[nw_ag_flag] is True) + df["predicted_unit_value"] = df["predicted_unit_value"].mask(non_winsorised, np.nan) + return df diff --git a/mbs_results/calculate_ratio_estimation.py b/mbs_results/calculate_ratio_estimation.py index 6e05f3e4..e0105420 100755 --- a/mbs_results/calculate_ratio_estimation.py +++ b/mbs_results/calculate_ratio_estimation.py @@ -1,24 +1,20 @@ -import pandas as pd +import numpy as np def calculate_ratio_estimation( - df: pd.DataFrame, - strata: str, - period: str, - aux: str, - sampled: str, - a_weight: str, - g_weight: str, - target_variable: str, - predicted_unit_value: str, - l_values: str, -) -> pd.DataFrame: + df, + aux, + sampled, + a_weight, + g_weight, + target_variable, + predicted_unit_value, + l_values, + nw_ag_flag, +): """ - Calculate link between target_variable and predictive_variable by strata, - a match_col must be supplied which indicates if target_variable - and predictive_variable can be linked. - + Calculate ratio estimation threshold Parameters ---------- df : pd.Dataframe @@ -41,20 +37,25 @@ def calculate_ratio_estimation( column name containing the predicted unit value l_values:str column containing the l values provided by methodology + nw_ag_flag: str + column name indicating whether it can't be winsorised- + boolean (1 means it can't be winsorised, 0 means it can). + Returns ------- df : pd.DataFrame - A pandas DataFrame with a new column containing the predicted unit value. + A pandas DataFrame with a new column containing the ratio estimation. """ - df = df[df["predicted_unit_value"].notna()] - df = df.reset_index(drop=True) - # check if reset index creates problems down the line - - df["flag_calculation"] = df["a_weight"] * df["g_weight"] - df["ratio_estimation_treshold"] = df["predicted_unit_value"] + ( - df["l_values"] / (df["flag_calculation"] - 1) + df["flag_calculation"] = df[a_weight] * df[g_weight] + df["ratio_estimation_treshold"] = (df[predicted_unit_value]) + ( + df[l_values] / (df["flag_calculation"] - 1) ) df = df.drop("flag_calculation", axis=1) + non_winsorised = (df[sampled] == 0) | (df[nw_ag_flag] is True) + df["ratio_estimation_treshold"] = df["ratio_estimation_treshold"].mask( + non_winsorised, np.nan + ) + return df diff --git a/mbs_results/calculate_winsorised_weight.py b/mbs_results/calculate_winsorised_weight.py index d78689b5..b9980bac 100755 --- a/mbs_results/calculate_winsorised_weight.py +++ b/mbs_results/calculate_winsorised_weight.py @@ -1,21 +1,20 @@ import numpy as np -import pandas as pd def calculate_winsorised_weight( - df: pd.DataFrame, - strata: str, - period: str, - aux: str, - sampled: str, - a_weight: str, - g_weight: str, - target_variable: str, - nw_ag_flag: str, - predicted_unit_value: str, - l_values: str, - ratio_estimation_treshold: str, -) -> pd.DataFrame: + df, + strata, + period, + aux, + sampled, + a_weight, + g_weight, + target_variable, + predicted_unit_value, + l_values, + ratio_estimation_treshold, + nw_ag_flag, +): """ Calculate winsorised weight @@ -47,29 +46,32 @@ def calculate_winsorised_weight( column name containing the l values as provided by methodology. ratio_estimation_treshold: str column name containing the previously calculated ratio estimation threshold. + nw_ag_flag: str + column name indicating whether it can't be winsorised- + boolean (1 means it can't be winsorised, 0 means it can). Returns ------- df : pd.DataFrame - A pandas DataFrame with a new column containing the predicted unit value. + A pandas DataFrame with a new column containing the winsorised weights. """ - df = df[df["predicted_unit_value"].notna()] - df = df.reset_index(drop=True) - # check if reset index creates problems down the line + df["w"] = df[a_weight] * df[g_weight] - df["w"] = df["a_weight"] * df["g_weight"] - - df["new_target"] = df["target_variable"] / df["w"] + ( - df["ratio_estimation_treshold"] - (df["ratio_estimation_treshold"] / df["w"]) + df["new_target"] = (df[target_variable] / df["w"]) + ( + df[ratio_estimation_treshold] - (df[ratio_estimation_treshold] / df["w"]) ) - mask = df["target_variable"] <= df["ratio_estimation_treshold"] - df["new_target_variable"] = np.where(mask, df["target_variable"], df["new_target"]) + mask = df[target_variable] <= df[ratio_estimation_treshold] + df["new_target_variable"] = np.where(mask, df[target_variable], df["new_target"]) - df["outlier_weight"] = df["new_target_variable"] / df["target_variable"] + df["outlier_weight"] = df["new_target_variable"] / df[target_variable] df = df.drop(["w", "new_target"], axis=1) + non_winsorised = (df[sampled] == 0) | (df[nw_ag_flag] is True) + df["outlier_weight"] = df["outlier_weight"].mask(non_winsorised, np.nan) + df["new_target_variable"] = df["new_target_variable"].mask(non_winsorised, np.nan) + return df diff --git a/mbs_results/flag_for_winsorisation.py b/mbs_results/flag_for_winsorisation.py index e2b34c19..dc88634a 100644 --- a/mbs_results/flag_for_winsorisation.py +++ b/mbs_results/flag_for_winsorisation.py @@ -26,7 +26,7 @@ def winsorisation_flag(df, a_weight, g_weight): df["flag_calculation"] = df[a_weight] * df[g_weight] - df["nw_ag_flag"] = df["flag_calculation"].apply(lambda x: 1 if x <= 1 else 0) + df["nw_ag_flag"] = df["flag_calculation"].apply(lambda x: True if x <= 1 else 0) df = df.drop("flag_calculation", axis=1) diff --git a/tests/data/winsorisation/flag_data.csv b/tests/data/winsorisation/flag_data.csv index ce500442..4808924f 100755 --- a/tests/data/winsorisation/flag_data.csv +++ b/tests/data/winsorisation/flag_data.csv @@ -1,16 +1,16 @@ a_weight,g_weight,nw_ag_flag -1.666666667,1.023809524,0 -1.666666667,1.023809524,0 -1.666666667,1.023809524,0 -1.666666667,1.023809524,0 -1.666666667,1.023809524,0 -2.5,1.023809524,0 -2.5,1.023809524,0 -2.5,1.023809524,0 -2.5,1.023809524,0 -2.5,1.023809524,0 -0.32,0.004,1 -0.32,0.004,1 -,0.004,0 -,0.004,0 -0.5,2.0,1 +1.666666667,1.023809524,False +1.666666667,1.023809524,False +1.666666667,1.023809524,False +1.666666667,1.023809524,False +1.666666667,1.023809524,False +2.5,1.023809524,False +2.5,1.023809524,False +2.5,1.023809524,False +2.5,1.023809524,False +2.5,1.023809524,False +0.32,0.004,True +0.32,0.004,True +,0.004,False +,0.004,False +0.5,2.0,True diff --git a/tests/data/winsorisation/predicted_unit_data.csv b/tests/data/winsorisation/predicted_unit_data.csv deleted file mode 100755 index 7eeae577..00000000 --- a/tests/data/winsorisation/predicted_unit_data.csv +++ /dev/null @@ -1,15 +0,0 @@ -strata,period,aux,sampled,a_weight,g_weight,target_variable,nw_ag_flag -101,202401,10,0,1.666666667,1.023809524,12,0 -101,202401,23,1,1.666666667,1.023809524,20,0 -101,202401,41,1,1.666666667,1.023809524,20,0 -101,202401,53,1,1.666666667,1.023809524,40,0 -101,202401,12,0,1.666666667,1.023809524,10,0 -102,202401,50,1,2.5,1.023809524,60,0 -102,202401,40,1,2.5,1.023809524,50,0 -102,202401,45,0,2.5,1.023809524,50,0 -102,202401,70,0,2.5,1.023809524,60,0 -102,202401,86,0,2.5,1.023809524,90,0 -103,202401,20,0,0.32,0.004,90,1 -103,202401,30,0,0.32,0.004,90,1 -104,202401,20,0,,0.004,90,0 -104,202401,30,0,,0.004,90,0 diff --git a/tests/data/winsorisation/predicted_unit_value.csv b/tests/data/winsorisation/predicted_unit_value.csv deleted file mode 100755 index a9036212..00000000 --- a/tests/data/winsorisation/predicted_unit_value.csv +++ /dev/null @@ -1,15 +0,0 @@ -strata,period,aux,sampled,a_weight,g_weight,target_variable,nw_ag_flag,predicted_unit_value -101,202401,10,0,1.666666667,1.023809524,12,0 -101,202312,23,1,1.666666667,1.023809524,20,0,22.361111 -101,202401,41,1,1.666666667,1.023809524,20,0,39.861111 -101,202401,53,1,1.666666667,1.023809524,40,0,51.527778 -101,202401,12,0,1.666666667,1.023809524,10,0 -102,202401,50,1,2.5,1.023809524,60,0,48.611111 -102,202401,40,1,2.5,1.023809524,50,0,38.888889 -102,202401,45,0,2.5,1.023809524,50,0 -102,202401,70,0,2.5,1.023809524,60,0 -102,202401,86,0,2.5,1.023809524,90,0 -103,202401,20,0,0.32,0.004,90,1 -103,202401,30,0,0.32,0.004,90,1 -104,202401,20,0,,0.004,90,0 -104,202401,30,0,,0.004,90,0 diff --git a/tests/data/winsorisation/predicted_unit_value_data.csv b/tests/data/winsorisation/predicted_unit_value_data.csv new file mode 100755 index 00000000..bb513185 --- /dev/null +++ b/tests/data/winsorisation/predicted_unit_value_data.csv @@ -0,0 +1,15 @@ +strata,period,aux,sampled,a_weight,g_weight,target_variable,nw_ag_flag +101,202401,10,0,1.666666667,1.023809524,12,False +101,202401,23,1,1.666666667,1.023809524,20,False +101,202401,41,1,1.666666667,1.023809524,20,False +101,202401,53,1,1.666666667,1.023809524,40,False +101,202401,12,0,1.666666667,1.023809524,10,False +102,202401,50,1,2.5,1.023809524,60,False +102,202401,40,1,2.5,1.023809524,50,False +102,202401,45,0,2.5,1.023809524,50,False +102,202401,70,0,2.5,1.023809524,60,False +102,202401,86,0,2.5,1.023809524,90,True +103,202401,20,0,0.32,0.004,90,True +103,202401,30,0,0.32,0.004,90,False +104,202401,20,0,,0.004,90,False +104,202401,30,0,,0.004,90,True diff --git a/tests/data/winsorisation/predicted_unit_value_output.csv b/tests/data/winsorisation/predicted_unit_value_output.csv index b4236126..c26a3373 100755 --- a/tests/data/winsorisation/predicted_unit_value_output.csv +++ b/tests/data/winsorisation/predicted_unit_value_output.csv @@ -1,6 +1,15 @@ strata,period,aux,sampled,a_weight,g_weight,target_variable,nw_ag_flag,predicted_unit_value -101,202312,23,1,1.666666667,1.023809524,20,0,22.361111 -101,202401,41,1,1.666666667,1.023809524,20,0,39.861111 -101,202401,53,1,1.666666667,1.023809524,40,0,51.527778 -102,202401,50,1,2.5,1.023809524,60,0,48.611111 -102,202401,40,1,2.5,1.023809524,50,0,38.888889 +101,202401,10,0,1.666666667,1.023809524,12,False, +101,202312,23,1,1.666666667,1.023809524,20,False,22.361111 +101,202401,41,1,1.666666667,1.023809524,20,False,39.861111 +101,202401,53,1,1.666666667,1.023809524,40,False,51.527778 +101,202401,12,0,1.666666667,1.023809524,10,False, +102,202401,50,1,2.5,1.023809524,60,False,48.611111 +102,202401,40,1,2.5,1.023809524,50,False,38.888889 +102,202401,45,0,2.5,1.023809524,50,False, +102,202401,70,0,2.5,1.023809524,60,False, +102,202401,86,0,2.5,1.023809524,90,True, +103,202401,20,0,0.32,0.004,90,True, +103,202401,30,0,0.32,0.004,90,False, +104,202401,20,0,,0.004,90,False, +104,202401,30,0,,0.004,90,True, diff --git a/tests/data/winsorisation/ratio_estimation_data.csv b/tests/data/winsorisation/ratio_estimation_data.csv index 39a5f069..ccc8485e 100755 --- a/tests/data/winsorisation/ratio_estimation_data.csv +++ b/tests/data/winsorisation/ratio_estimation_data.csv @@ -1,13 +1,13 @@ -strata,period,aux,sampled,a_weight,g_weight,target_variable,predicted_unit_value,l_values -101,202401,10,0,1.666666667,1.023809524,12 -101,202312,23,1,1.666666667,1.023809524,20,22.361111,0.5 -101,202401,41,1,1.666666667,1.023809524,20,39.861111,0.5 -101,202401,53,1,1.666666667,1.023809524,40,51.527778,0.5 -101,202401,12,0,1.666666667,1.023809524,10 -102,202401,50,1,2.5,1.023809524,60,48.611111,0.5 -102,202401,40,1,2.5,1.023809524,50,38.888889,0.5 -102,202401,45,0,2.5,1.023809524,50 -102,202401,70,0,2.5,1.023809524,60 -102,202401,86,0,2.5,1.023809524,90 -104,202401,20,0,,0.004,90 -104,202401,30,0,,0.004,90 +strata,period,aux,sampled,a_weight,g_weight,target_variable,nw_ag_flag,predicted_unit_value,l_value +101,202401,10,0,1.666666667,1.023809524,12,False,, +101,202312,23,1,1.666666667,1.023809524,20,False,22.361111,0.5 +101,202401,41,1,1.666666667,1.023809524,20,False,39.861111,0.5 +101,202401,53,1,1.666666667,1.023809524,40,False,51.527778,0.5 +101,202401,12,0,1.666666667,1.023809524,10,False,, +102,202401,50,1,2.5,1.023809524,60,False,48.611111,0.5 +102,202401,40,1,2.5,1.023809524,50,False,38.888889,0.5 +102,202401,45,0,2.5,1.023809524,50,True,, +102,202401,70,0,2.5,1.023809524,60,True,, +102,202401,86,0,2.5,1.023809524,90,False,, +104,202401,20,0,,0.004,90,False,, +104,202401,30,0,,0.004,90,True,, diff --git a/tests/data/winsorisation/ratio_estimation_data_output.csv b/tests/data/winsorisation/ratio_estimation_data_output.csv index ca0d5b54..9c4e4c78 100755 --- a/tests/data/winsorisation/ratio_estimation_data_output.csv +++ b/tests/data/winsorisation/ratio_estimation_data_output.csv @@ -1,6 +1,13 @@ -strata,period,aux,sampled,a_weight,g_weight,target_variable,predicted_unit_value,l_values,ratio_estimation_treshold -101,202312,23,1,1.666666667,1.023809524,20,22.361111,0.5,23.068976 -101,202401,41,1,1.666666667,1.023809524,20,39.861111,0.5,40.568976 -101,202401,53,1,1.666666667,1.023809524,40,51.527778,0.5,52.235643 -102,202401,50,1,2.5,1.023809524,60,48.611111,0.5,48.931722 -102,202401,40,1,2.5,1.023809524,50,38.888889,0.5,39.209500 +strata,period,aux,sampled,a_weight,g_weight,target_variable,nw_ag_flag,predicted_unit_value,l_value,ratio_estimation_treshold +101,202401,10,0,1.666666667,1.023809524,12,False,,, +101,202312,23,1,1.666666667,1.023809524,20,False,22.361111,0.5,23.068976 +101,202401,41,1,1.666666667,1.023809524,20,False,39.861111,0.5,40.568976 +101,202401,53,1,1.666666667,1.023809524,40,False,51.527778,0.5,52.235643 +101,202401,12,0,1.666666667,1.023809524,10,False,,, +102,202401,50,1,2.5,1.023809524,60,False,48.611111,0.5,48.931722 +102,202401,40,1,2.5,1.023809524,50,False,38.888889,0.5,39.209500 +102,202401,45,0,2.5,1.023809524,50,True,,, +102,202401,70,0,2.5,1.023809524,60,True,,, +102,202401,86,0,2.5,1.023809524,90,False,,, +104,202401,20,0,,0.004,90,False,,, +104,202401,30,0,,0.004,90,True,,, diff --git a/tests/data/winsorisation/winsorised_weight_data.csv b/tests/data/winsorisation/winsorised_weight_data.csv index 84706398..9c4e4c78 100755 --- a/tests/data/winsorisation/winsorised_weight_data.csv +++ b/tests/data/winsorisation/winsorised_weight_data.csv @@ -1,6 +1,13 @@ -strata,period,aux,sampled,a_weight,g_weight,target_variable,nw_ag_flag,predicted_unit_value,l_values,ratio_estimation_treshold -101,202312,23,1,1.666666667,1.023809524,20,0,22.361111,0.5, 23.0689763 -101,202401,41,1,1.666666667,1.023809524,20,0,39.861111,0.5,40.5689763 -101,202401,53,1,1.666666667,1.023809524,40,0,51.527778,0.5,52.2356429 -102,202401,50,1,2.5,1.023809524,60,0,48.611111,0.5,48.9317218 -102,202401,40,1,2.5,1.023809524,50,0,38.888889,0.5,39.2094996 +strata,period,aux,sampled,a_weight,g_weight,target_variable,nw_ag_flag,predicted_unit_value,l_value,ratio_estimation_treshold +101,202401,10,0,1.666666667,1.023809524,12,False,,, +101,202312,23,1,1.666666667,1.023809524,20,False,22.361111,0.5,23.068976 +101,202401,41,1,1.666666667,1.023809524,20,False,39.861111,0.5,40.568976 +101,202401,53,1,1.666666667,1.023809524,40,False,51.527778,0.5,52.235643 +101,202401,12,0,1.666666667,1.023809524,10,False,,, +102,202401,50,1,2.5,1.023809524,60,False,48.611111,0.5,48.931722 +102,202401,40,1,2.5,1.023809524,50,False,38.888889,0.5,39.209500 +102,202401,45,0,2.5,1.023809524,50,True,,, +102,202401,70,0,2.5,1.023809524,60,True,,, +102,202401,86,0,2.5,1.023809524,90,False,,, +104,202401,20,0,,0.004,90,False,,, +104,202401,30,0,,0.004,90,True,,, diff --git a/tests/data/winsorisation/winsorised_weight_data_output.csv b/tests/data/winsorisation/winsorised_weight_data_output.csv index 06178971..39bd3a7c 100755 --- a/tests/data/winsorisation/winsorised_weight_data_output.csv +++ b/tests/data/winsorisation/winsorised_weight_data_output.csv @@ -1,6 +1,13 @@ -strata,period,aux,sampled,a_weight,g_weight,target_variable,nw_ag_flag,predicted_unit_value,l_values,ratio_estimation_treshold,new_target_variable,outlier_weight -101,202312,23,1,1.666666667,1.023809524,20,0,22.361111,0.5, 23.0689763, 20.0000, 1.00000 -101,202401,41,1,1.666666667,1.023809524,20,0,39.861111,0.5,40.5689763, 20.00000, 1.00000 -101,202401,53,1,1.666666667,1.023809524,40,0,51.527778,0.5,52.2356429, 40.00000,1.00000 -102,202401,50,1,2.5,1.023809524,60,0,48.611111,0.5,48.9317218, 53.256072,0.887601 -102,202401,40,1,2.5,1.023809524,50,0,38.888889,0.5,39.2094996, 43.425323,0.868506 +strata,period,aux,sampled,a_weight,g_weight,target_variable,nw_ag_flag,predicted_unit_value,l_value,ratio_estimation_treshold,new_target_variable,outlier_weight +101,202401,10,0,1.666666667,1.023809524,12,False,,,,, +101,202312,23,1,1.666666667,1.023809524,20,False,22.361111,0.5,23.068976,20.0000, 1.00000 +101,202401,41,1,1.666666667,1.023809524,20,False,39.861111,0.5,40.568976,20.00000, 1.00000 +101,202401,53,1,1.666666667,1.023809524,40,False,51.527778,0.5,52.235643,40.00000,1.00000 +101,202401,12,0,1.666666667,1.023809524,10,False,,,,, +102,202401,50,1,2.5,1.023809524,60,False,48.611111,0.5,48.931722,53.256072,0.887601 +102,202401,40,1,2.5,1.023809524,50,False,38.888889,0.5,39.209500,43.425323,0.868506 +102,202401,45,0,2.5,1.023809524,50,True,,,,, +102,202401,70,0,2.5,1.023809524,60,True,,,,, +102,202401,86,0,2.5,1.023809524,90,False,,,,, +104,202401,20,0,,0.004,90,False,,,,, +104,202401,30,0,,0.004,90,True,,,,, diff --git a/tests/test_calculate_predicted_unit_value.py b/tests/test_calculate_predicted_unit_value.py index fe814c80..2bdc493b 100755 --- a/tests/test_calculate_predicted_unit_value.py +++ b/tests/test_calculate_predicted_unit_value.py @@ -10,7 +10,7 @@ @pytest.fixture(scope="class") def predicted_unit_value_test_data(): return pd.read_csv( - Path("tests") / "data" / "winsorisation" / "predicted_unit_value.csv", + Path("tests") / "data" / "winsorisation" / "predicted_unit_value_data.csv", low_memory=False, usecols=lambda c: not c.startswith("Unnamed:"), ) @@ -27,12 +27,10 @@ def predicted_unit_value_test_output(): class TestPredictedUnitValue: def test_calculate_predicted_unit_value( - self, predicted_unit_value_test_output, predicted_unit_value_test_data + self, predicted_unit_value_test_data, predicted_unit_value_test_output ): expected_output = predicted_unit_value_test_output[ [ - "period", - "strata", "aux", "sampled", "a_weight", @@ -43,25 +41,20 @@ def test_calculate_predicted_unit_value( ] input_data = predicted_unit_value_test_data[ [ - "period", - "strata", "aux", "sampled", "a_weight", "target_variable", "nw_ag_flag", - "predicted_unit_value", ] ] - input_data = input_data.drop(columns=["predicted_unit_value"]) + actual_output = calculate_predicted_unit_value( input_data, - "period", - "strata", "aux", "sampled", "a_weight", - "target_variale", + "target_variable", "nw_ag_flag", ) diff --git a/tests/test_calculate_ratio_estimation.py b/tests/test_calculate_ratio_estimation.py index e99c2a2d..c7087c29 100755 --- a/tests/test_calculate_ratio_estimation.py +++ b/tests/test_calculate_ratio_estimation.py @@ -31,43 +31,41 @@ def test_calculate_ratio_estimation( ): expected_output = ratio_estimation_test_output[ [ - "strata", - "period", "aux", "sampled", "a_weight", "g_weight", "target_variable", "predicted_unit_value", - "l_values", + "l_value", + "nw_ag_flag", "ratio_estimation_treshold", ] ] + input_data = ratio_estimation_test_data[ [ - "strata", - "period", "aux", "sampled", "a_weight", "g_weight", "target_variable", "predicted_unit_value", - "l_values", + "l_value", + "nw_ag_flag", ] ] actual_output = calculate_ratio_estimation( input_data, - "strata", - "period", "aux", "sampled", "a_weight", "g_weight", "target_variale", "predicted_unit_value", - "l_values", + "l_value", + "nw_ag_flag", ) assert_frame_equal(actual_output, expected_output) diff --git a/tests/test_calculate_winsorised_weight.py b/tests/test_calculate_winsorised_weight.py index 546b1377..f9268d5c 100755 --- a/tests/test_calculate_winsorised_weight.py +++ b/tests/test_calculate_winsorised_weight.py @@ -38,10 +38,10 @@ def test_winsorised_weight( "a_weight", "g_weight", "target_variable", - "nw_ag_flag", "predicted_unit_value", - "l_values", + "l_value", "ratio_estimation_treshold", + "nw_ag_flag", "new_target_variable", "outlier_weight", ] @@ -55,10 +55,10 @@ def test_winsorised_weight( "a_weight", "g_weight", "target_variable", - "nw_ag_flag", "predicted_unit_value", - "l_values", + "l_value", "ratio_estimation_treshold", + "nw_ag_flag", ] ] @@ -71,10 +71,10 @@ def test_winsorised_weight( "a_weight", "g_weight", "target_variable", - "nw_ag_flag", "predicted_unit_value", - "l_values", + "l_value", "ratio_estimation_treshold", + "nw_ag_flag", ) assert_frame_equal(actual_output, expected_output) diff --git a/tests/test_flag_for_winsorisation.py b/tests/test_flag_for_winsorisation.py index 9dff320b..849567b5 100755 --- a/tests/test_flag_for_winsorisation.py +++ b/tests/test_flag_for_winsorisation.py @@ -28,4 +28,4 @@ def test_winsorisation_flag(self, winsorisation_flag_test_data): df=df_input, a_weight="a_weight", g_weight="g_weight" ) - assert_frame_equal(df_output, df_expected_output) + assert_frame_equal(df_output, df_expected_output, check_dtype=False) From cee161d1ce7ca0e1b340f00c30645c368186df68 Mon Sep 17 00:00:00 2001 From: giaccg Date: Fri, 2 Aug 2024 11:39:34 +0100 Subject: [PATCH 2/2] fixed docs --- mbs_results/calculate_predicted_unit_value.py | 2 +- mbs_results/calculate_ratio_estimation.py | 6 +----- mbs_results/calculate_winsorised_weight.py | 11 ++++------- 3 files changed, 6 insertions(+), 13 deletions(-) diff --git a/mbs_results/calculate_predicted_unit_value.py b/mbs_results/calculate_predicted_unit_value.py index d5ec1c88..6b927241 100644 --- a/mbs_results/calculate_predicted_unit_value.py +++ b/mbs_results/calculate_predicted_unit_value.py @@ -21,7 +21,7 @@ def calculate_predicted_unit_value( Column name of the predicted target variable. nw_ag_flag: str column name indicating whether it can't be winsorised- - boolean (1 means it can't be winsorised, 0 means it can). + boolean (True means it can't be winsorised, False means it can). Returns ------- df : pd.DataFrame diff --git a/mbs_results/calculate_ratio_estimation.py b/mbs_results/calculate_ratio_estimation.py index e0105420..d0d6426e 100755 --- a/mbs_results/calculate_ratio_estimation.py +++ b/mbs_results/calculate_ratio_estimation.py @@ -19,10 +19,6 @@ def calculate_ratio_estimation( ---------- df : pd.Dataframe Original dataframe. - period : str - Column name containing time period. - strata : str - Column name containing strata information (sic). aux : str Column name containing auxiliary variable (x). sampled : str @@ -39,7 +35,7 @@ def calculate_ratio_estimation( column containing the l values provided by methodology nw_ag_flag: str column name indicating whether it can't be winsorised- - boolean (1 means it can't be winsorised, 0 means it can). + boolean (True means it can't be winsorised, False means it can). Returns ------- diff --git a/mbs_results/calculate_winsorised_weight.py b/mbs_results/calculate_winsorised_weight.py index b9980bac..f3bebbec 100755 --- a/mbs_results/calculate_winsorised_weight.py +++ b/mbs_results/calculate_winsorised_weight.py @@ -22,11 +22,11 @@ def calculate_winsorised_weight( Parameters ---------- df : pd.Dataframe - Original dataframe. - period : str - Column name containing time period. + Original dataframe strata : str Column name containing strata information (sic). + period : str + Column name containing time period. aux : str Column name containing auxiliary variable (x). sampled : str @@ -37,9 +37,6 @@ def calculate_winsorised_weight( column name containing the g weight. target_variable : str Column name of the predicted target variable. - nw_ag_flag: str - column name indicating whether it can't be winsorised- - boolean (1 means it can't be winsorised, 0 means it can). predicted_unit_value: str column name containing the predicted unit value. l_values: str @@ -48,7 +45,7 @@ def calculate_winsorised_weight( column name containing the previously calculated ratio estimation threshold. nw_ag_flag: str column name indicating whether it can't be winsorised- - boolean (1 means it can't be winsorised, 0 means it can). + boolean (True means it can't be winsorised, False means it can). Returns