From 26b4a35848ff308d9b5e93e46e504c46bb4ab3c2 Mon Sep 17 00:00:00 2001 From: giuliag92 <135708493+giuliag92@users.noreply.github.com> Date: Wed, 7 Aug 2024 12:06:02 +0100 Subject: [PATCH] 419 Fixed grouping by period and group (#67) * fixed grouping by period and group * changed strata to group --- mbs_results/calculate_predicted_unit_value.py | 51 +++++++++++++++---- mbs_results/calculate_winsorised_weight.py | 6 +-- .../predicted_unit_value_data.csv | 6 +-- .../predicted_unit_value_output.csv | 12 ++--- .../winsorisation/ratio_estimation_data.csv | 12 ++--- .../ratio_estimation_data_output.csv | 12 ++--- .../winsorisation/winsorised_weight_data.csv | 12 ++--- .../winsorised_weight_data_output.csv | 12 ++--- tests/test_calculate_predicted_unit_value.py | 6 +++ tests/test_calculate_winsorised_weight.py | 6 +-- 10 files changed, 87 insertions(+), 48 deletions(-) diff --git a/mbs_results/calculate_predicted_unit_value.py b/mbs_results/calculate_predicted_unit_value.py index 3631bd4b..014e8860 100644 --- a/mbs_results/calculate_predicted_unit_value.py +++ b/mbs_results/calculate_predicted_unit_value.py @@ -2,7 +2,7 @@ def calculate_predicted_unit_value( - df, aux, sampled, a_weight, target_variable, nw_ag_flag + df, group, period, aux, sampled, a_weight, target_variable, nw_ag_flag ): """ Calculate predicted unit value @@ -11,6 +11,10 @@ def calculate_predicted_unit_value( ---------- df : pd.Dataframe Original dataframe. + group : str + Column name containing group information (sic). + period : str + Column name containing time period. aux : str Column name containing auxiliary variable (x). sampled : str @@ -31,16 +35,45 @@ def calculate_predicted_unit_value( winsorised = (df[sampled] == 1) & (df[nw_ag_flag] == False) # noqa: E712 filtered_df = df.loc[winsorised] - sum_weighted_target_values = ( + filtered_df["weighted_target_values"] = ( filtered_df[a_weight] * filtered_df[target_variable] - ).sum() - sum_weighted_auxiliary_values = (filtered_df[a_weight] * filtered_df[aux]).sum() + ) + filtered_df["weighted_auxiliary_values"] = filtered_df[a_weight] * filtered_df[aux] - df["predicted_unit_value"] = df[aux].apply( - lambda x: x * (sum_weighted_target_values / sum_weighted_auxiliary_values) + sum_weighted_target_values = ( + filtered_df.groupby([group, period])["weighted_target_values"] + .sum() + .to_frame(name="sum_weighted_target_values") + .reset_index() + ) + sum_weighted_auxiliary_values = ( + filtered_df.groupby([group, period])["weighted_auxiliary_values"] + .sum() + .to_frame(name="sum_weighted_auxiliary_values") + .reset_index() ) - non_winsorised = (df[sampled] == 0) | (df[nw_ag_flag] == True) # noqa: E712 - df["predicted_unit_value"] = df["predicted_unit_value"].mask(non_winsorised, np.nan) + total_sum_weighted = sum_weighted_target_values.merge( + sum_weighted_auxiliary_values, on=[group, period], how="left" + ) + + final_df = df.merge(total_sum_weighted, on=[group, period], how="left") + + final_df["predicted_unit_value"] = ( + final_df[aux] + * final_df["sum_weighted_target_values"] + / final_df["sum_weighted_auxiliary_values"] + ) + + final_df = final_df.drop( + ["sum_weighted_target_values", "sum_weighted_auxiliary_values"], axis=1 + ) + + non_winsorised = (final_df[sampled] == 0) | ( + final_df[nw_ag_flag] == True # noqa: E712 + ) + final_df["predicted_unit_value"] = final_df["predicted_unit_value"].mask( + non_winsorised, np.nan + ) - return df + return final_df diff --git a/mbs_results/calculate_winsorised_weight.py b/mbs_results/calculate_winsorised_weight.py index 34b96b35..05c6fbfe 100755 --- a/mbs_results/calculate_winsorised_weight.py +++ b/mbs_results/calculate_winsorised_weight.py @@ -3,7 +3,7 @@ def calculate_winsorised_weight( df, - strata, + group, period, aux, sampled, @@ -23,8 +23,8 @@ def calculate_winsorised_weight( ---------- df : pd.Dataframe Original dataframe - strata : str - Column name containing strata information (sic). + group : str + Column name containing group information (sic). period : str Column name containing time period. aux : str diff --git a/tests/data/winsorisation/predicted_unit_value_data.csv b/tests/data/winsorisation/predicted_unit_value_data.csv index bb513185..bc195391 100755 --- a/tests/data/winsorisation/predicted_unit_value_data.csv +++ b/tests/data/winsorisation/predicted_unit_value_data.csv @@ -1,11 +1,11 @@ -strata,period,aux,sampled,a_weight,g_weight,target_variable,nw_ag_flag +group,period,aux,sampled,a_weight,g_weight,target_variable,nw_ag_flag 101,202401,10,0,1.666666667,1.023809524,12,False 101,202401,23,1,1.666666667,1.023809524,20,False 101,202401,41,1,1.666666667,1.023809524,20,False -101,202401,53,1,1.666666667,1.023809524,40,False +101,202402,53,1,1.666666667,1.023809524,40,False 101,202401,12,0,1.666666667,1.023809524,10,False 102,202401,50,1,2.5,1.023809524,60,False -102,202401,40,1,2.5,1.023809524,50,False +102,202402,40,1,2.5,1.023809524,50,False 102,202401,45,0,2.5,1.023809524,50,False 102,202401,70,0,2.5,1.023809524,60,False 102,202401,86,0,2.5,1.023809524,90,True diff --git a/tests/data/winsorisation/predicted_unit_value_output.csv b/tests/data/winsorisation/predicted_unit_value_output.csv index c26a3373..4f68d059 100755 --- a/tests/data/winsorisation/predicted_unit_value_output.csv +++ b/tests/data/winsorisation/predicted_unit_value_output.csv @@ -1,11 +1,11 @@ -strata,period,aux,sampled,a_weight,g_weight,target_variable,nw_ag_flag,predicted_unit_value +group,period,aux,sampled,a_weight,g_weight,target_variable,nw_ag_flag,predicted_unit_value 101,202401,10,0,1.666666667,1.023809524,12,False, -101,202312,23,1,1.666666667,1.023809524,20,False,22.361111 -101,202401,41,1,1.666666667,1.023809524,20,False,39.861111 -101,202401,53,1,1.666666667,1.023809524,40,False,51.527778 +101,202401,23,1,1.666666667,1.023809524,20,False,14.375 +101,202401,41,1,1.666666667,1.023809524,20,False,25.625 +101,202402,53,1,1.666666667,1.023809524,40,False,40 101,202401,12,0,1.666666667,1.023809524,10,False, -102,202401,50,1,2.5,1.023809524,60,False,48.611111 -102,202401,40,1,2.5,1.023809524,50,False,38.888889 +102,202401,50,1,2.5,1.023809524,60,False,60 +102,202402,40,1,2.5,1.023809524,50,False,50 102,202401,45,0,2.5,1.023809524,50,False, 102,202401,70,0,2.5,1.023809524,60,False, 102,202401,86,0,2.5,1.023809524,90,True, diff --git a/tests/data/winsorisation/ratio_estimation_data.csv b/tests/data/winsorisation/ratio_estimation_data.csv index ccc8485e..a8b74585 100755 --- a/tests/data/winsorisation/ratio_estimation_data.csv +++ b/tests/data/winsorisation/ratio_estimation_data.csv @@ -1,11 +1,11 @@ -strata,period,aux,sampled,a_weight,g_weight,target_variable,nw_ag_flag,predicted_unit_value,l_value +group,period,aux,sampled,a_weight,g_weight,target_variable,nw_ag_flag,predicted_unit_value,l_value 101,202401,10,0,1.666666667,1.023809524,12,False,, -101,202312,23,1,1.666666667,1.023809524,20,False,22.361111,0.5 -101,202401,41,1,1.666666667,1.023809524,20,False,39.861111,0.5 -101,202401,53,1,1.666666667,1.023809524,40,False,51.527778,0.5 +101,202401,23,1,1.666666667,1.023809524,20,False,14.375,0.5 +101,202401,41,1,1.666666667,1.023809524,20,False,25.625,0.5 +101,202402,53,1,1.666666667,1.023809524,40,False,40,0.5 101,202401,12,0,1.666666667,1.023809524,10,False,, -102,202401,50,1,2.5,1.023809524,60,False,48.611111,0.5 -102,202401,40,1,2.5,1.023809524,50,False,38.888889,0.5 +102,202401,50,1,2.5,1.023809524,60,False,60,0.5 +102,202402,40,1,2.5,1.023809524,50,False,50,0.5 102,202401,45,0,2.5,1.023809524,50,True,, 102,202401,70,0,2.5,1.023809524,60,True,, 102,202401,86,0,2.5,1.023809524,90,False,, diff --git a/tests/data/winsorisation/ratio_estimation_data_output.csv b/tests/data/winsorisation/ratio_estimation_data_output.csv index 9c4e4c78..f092517b 100755 --- a/tests/data/winsorisation/ratio_estimation_data_output.csv +++ b/tests/data/winsorisation/ratio_estimation_data_output.csv @@ -1,11 +1,11 @@ -strata,period,aux,sampled,a_weight,g_weight,target_variable,nw_ag_flag,predicted_unit_value,l_value,ratio_estimation_treshold +group,period,aux,sampled,a_weight,g_weight,target_variable,nw_ag_flag,predicted_unit_value,l_value,ratio_estimation_treshold 101,202401,10,0,1.666666667,1.023809524,12,False,,, -101,202312,23,1,1.666666667,1.023809524,20,False,22.361111,0.5,23.068976 -101,202401,41,1,1.666666667,1.023809524,20,False,39.861111,0.5,40.568976 -101,202401,53,1,1.666666667,1.023809524,40,False,51.527778,0.5,52.235643 +101,202401,23,1,1.666666667,1.023809524,20,False,14.375,0.5,15.0828652 +101,202401,41,1,1.666666667,1.023809524,20,False,25.625,0.5,26.3328652 +101,202402,53,1,1.666666667,1.023809524,40,False,40,0.5,40.7078652 101,202401,12,0,1.666666667,1.023809524,10,False,,, -102,202401,50,1,2.5,1.023809524,60,False,48.611111,0.5,48.931722 -102,202401,40,1,2.5,1.023809524,50,False,38.888889,0.5,39.209500 +102,202401,50,1,2.5,1.023809524,60,False,60,0.5,60.3206107 +102,202402,40,1,2.5,1.023809524,50,False,50,0.5,50.3206107 102,202401,45,0,2.5,1.023809524,50,True,,, 102,202401,70,0,2.5,1.023809524,60,True,,, 102,202401,86,0,2.5,1.023809524,90,False,,, diff --git a/tests/data/winsorisation/winsorised_weight_data.csv b/tests/data/winsorisation/winsorised_weight_data.csv index 9c4e4c78..f092517b 100755 --- a/tests/data/winsorisation/winsorised_weight_data.csv +++ b/tests/data/winsorisation/winsorised_weight_data.csv @@ -1,11 +1,11 @@ -strata,period,aux,sampled,a_weight,g_weight,target_variable,nw_ag_flag,predicted_unit_value,l_value,ratio_estimation_treshold +group,period,aux,sampled,a_weight,g_weight,target_variable,nw_ag_flag,predicted_unit_value,l_value,ratio_estimation_treshold 101,202401,10,0,1.666666667,1.023809524,12,False,,, -101,202312,23,1,1.666666667,1.023809524,20,False,22.361111,0.5,23.068976 -101,202401,41,1,1.666666667,1.023809524,20,False,39.861111,0.5,40.568976 -101,202401,53,1,1.666666667,1.023809524,40,False,51.527778,0.5,52.235643 +101,202401,23,1,1.666666667,1.023809524,20,False,14.375,0.5,15.0828652 +101,202401,41,1,1.666666667,1.023809524,20,False,25.625,0.5,26.3328652 +101,202402,53,1,1.666666667,1.023809524,40,False,40,0.5,40.7078652 101,202401,12,0,1.666666667,1.023809524,10,False,,, -102,202401,50,1,2.5,1.023809524,60,False,48.611111,0.5,48.931722 -102,202401,40,1,2.5,1.023809524,50,False,38.888889,0.5,39.209500 +102,202401,50,1,2.5,1.023809524,60,False,60,0.5,60.3206107 +102,202402,40,1,2.5,1.023809524,50,False,50,0.5,50.3206107 102,202401,45,0,2.5,1.023809524,50,True,,, 102,202401,70,0,2.5,1.023809524,60,True,,, 102,202401,86,0,2.5,1.023809524,90,False,,, diff --git a/tests/data/winsorisation/winsorised_weight_data_output.csv b/tests/data/winsorisation/winsorised_weight_data_output.csv index 39bd3a7c..cd706d62 100755 --- a/tests/data/winsorisation/winsorised_weight_data_output.csv +++ b/tests/data/winsorisation/winsorised_weight_data_output.csv @@ -1,11 +1,11 @@ -strata,period,aux,sampled,a_weight,g_weight,target_variable,nw_ag_flag,predicted_unit_value,l_value,ratio_estimation_treshold,new_target_variable,outlier_weight +group,period,aux,sampled,a_weight,g_weight,target_variable,nw_ag_flag,predicted_unit_value,l_value,ratio_estimation_treshold,new_target_variable,outlier_weight 101,202401,10,0,1.666666667,1.023809524,12,False,,,,, -101,202312,23,1,1.666666667,1.023809524,20,False,22.361111,0.5,23.068976,20.0000, 1.00000 -101,202401,41,1,1.666666667,1.023809524,20,False,39.861111,0.5,40.568976,20.00000, 1.00000 -101,202401,53,1,1.666666667,1.023809524,40,False,51.527778,0.5,52.235643,40.00000,1.00000 +101,202401,23,1,1.666666667,1.023809524,20,False,14.375,0.5,15.0828652,17.96453488,0.898227 +101,202401,41,1,1.666666667,1.023809524,20,False,25.625,0.5,26.3328652,20,1 +101,202402,53,1,1.666666667,1.023809524,40,False,40,0.5,40.7078652,40,1 101,202401,12,0,1.666666667,1.023809524,10,False,,,,, -102,202401,50,1,2.5,1.023809524,60,False,48.611111,0.5,48.931722,53.256072,0.887601 -102,202401,40,1,2.5,1.023809524,50,False,38.888889,0.5,39.209500,43.425323,0.868506 +102,202401,50,1,2.5,1.023809524,60,False,60,0.5,60.3206107,60,1 +102,202402,40,1,2.5,1.023809524,50,False,50,0.5,50.3206107,50,1 102,202401,45,0,2.5,1.023809524,50,True,,,,, 102,202401,70,0,2.5,1.023809524,60,True,,,,, 102,202401,86,0,2.5,1.023809524,90,False,,,,, diff --git a/tests/test_calculate_predicted_unit_value.py b/tests/test_calculate_predicted_unit_value.py index 2bdc493b..2693ec2d 100755 --- a/tests/test_calculate_predicted_unit_value.py +++ b/tests/test_calculate_predicted_unit_value.py @@ -31,6 +31,8 @@ def test_calculate_predicted_unit_value( ): expected_output = predicted_unit_value_test_output[ [ + "group", + "period", "aux", "sampled", "a_weight", @@ -41,6 +43,8 @@ def test_calculate_predicted_unit_value( ] input_data = predicted_unit_value_test_data[ [ + "group", + "period", "aux", "sampled", "a_weight", @@ -51,6 +55,8 @@ def test_calculate_predicted_unit_value( actual_output = calculate_predicted_unit_value( input_data, + "group", + "period", "aux", "sampled", "a_weight", diff --git a/tests/test_calculate_winsorised_weight.py b/tests/test_calculate_winsorised_weight.py index f9268d5c..89c4a349 100755 --- a/tests/test_calculate_winsorised_weight.py +++ b/tests/test_calculate_winsorised_weight.py @@ -31,7 +31,7 @@ def test_winsorised_weight( ): expected_output = winsorised_weight_test_output[ [ - "strata", + "group", "period", "aux", "sampled", @@ -48,7 +48,7 @@ def test_winsorised_weight( ] input_data = winsorised_weight_test_data[ [ - "strata", + "group", "period", "aux", "sampled", @@ -64,7 +64,7 @@ def test_winsorised_weight( actual_output = calculate_winsorised_weight( input_data, - "strata", + "group", "period", "aux", "sampled",