Skip to content

Commit

Permalink
419 Fixed grouping by period and group (#67)
Browse files Browse the repository at this point in the history
* fixed grouping by period and group

* changed strata to group
  • Loading branch information
giuliag92 authored Aug 7, 2024
1 parent 39788bc commit 26b4a35
Show file tree
Hide file tree
Showing 10 changed files with 87 additions and 48 deletions.
51 changes: 42 additions & 9 deletions mbs_results/calculate_predicted_unit_value.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@


def calculate_predicted_unit_value(
df, aux, sampled, a_weight, target_variable, nw_ag_flag
df, group, period, aux, sampled, a_weight, target_variable, nw_ag_flag
):
"""
Calculate predicted unit value
Expand All @@ -11,6 +11,10 @@ def calculate_predicted_unit_value(
----------
df : pd.DataFrame
Original dataframe.
group : str
Column name containing group information (sic).
period : str
Column name containing time period.
aux : str
Column name containing auxiliary variable (x).
sampled : str
Expand All @@ -31,16 +35,45 @@ def calculate_predicted_unit_value(
winsorised = (df[sampled] == 1) & (df[nw_ag_flag] == False) # noqa: E712
filtered_df = df.loc[winsorised]

sum_weighted_target_values = (
filtered_df["weighted_target_values"] = (
filtered_df[a_weight] * filtered_df[target_variable]
).sum()
sum_weighted_auxiliary_values = (filtered_df[a_weight] * filtered_df[aux]).sum()
)
filtered_df["weighted_auxiliary_values"] = filtered_df[a_weight] * filtered_df[aux]

df["predicted_unit_value"] = df[aux].apply(
lambda x: x * (sum_weighted_target_values / sum_weighted_auxiliary_values)
sum_weighted_target_values = (
filtered_df.groupby([group, period])["weighted_target_values"]
.sum()
.to_frame(name="sum_weighted_target_values")
.reset_index()
)
sum_weighted_auxiliary_values = (
filtered_df.groupby([group, period])["weighted_auxiliary_values"]
.sum()
.to_frame(name="sum_weighted_auxiliary_values")
.reset_index()
)

non_winsorised = (df[sampled] == 0) | (df[nw_ag_flag] == True) # noqa: E712
df["predicted_unit_value"] = df["predicted_unit_value"].mask(non_winsorised, np.nan)
total_sum_weighted = sum_weighted_target_values.merge(
sum_weighted_auxiliary_values, on=[group, period], how="left"
)

final_df = df.merge(total_sum_weighted, on=[group, period], how="left")

final_df["predicted_unit_value"] = (
final_df[aux]
* final_df["sum_weighted_target_values"]
/ final_df["sum_weighted_auxiliary_values"]
)

final_df = final_df.drop(
["sum_weighted_target_values", "sum_weighted_auxiliary_values"], axis=1
)

non_winsorised = (final_df[sampled] == 0) | (
final_df[nw_ag_flag] == True # noqa: E712
)
final_df["predicted_unit_value"] = final_df["predicted_unit_value"].mask(
non_winsorised, np.nan
)

return df
return final_df
6 changes: 3 additions & 3 deletions mbs_results/calculate_winsorised_weight.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

def calculate_winsorised_weight(
df,
strata,
group,
period,
aux,
sampled,
Expand All @@ -23,8 +23,8 @@ def calculate_winsorised_weight(
----------
df : pd.DataFrame
Original dataframe
strata : str
Column name containing strata information (sic).
group : str
Column name containing group information (sic).
period : str
Column name containing time period.
aux : str
Expand Down
6 changes: 3 additions & 3 deletions tests/data/winsorisation/predicted_unit_value_data.csv
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
strata,period,aux,sampled,a_weight,g_weight,target_variable,nw_ag_flag
group,period,aux,sampled,a_weight,g_weight,target_variable,nw_ag_flag
101,202401,10,0,1.666666667,1.023809524,12,False
101,202401,23,1,1.666666667,1.023809524,20,False
101,202401,41,1,1.666666667,1.023809524,20,False
101,202401,53,1,1.666666667,1.023809524,40,False
101,202402,53,1,1.666666667,1.023809524,40,False
101,202401,12,0,1.666666667,1.023809524,10,False
102,202401,50,1,2.5,1.023809524,60,False
102,202401,40,1,2.5,1.023809524,50,False
102,202402,40,1,2.5,1.023809524,50,False
102,202401,45,0,2.5,1.023809524,50,False
102,202401,70,0,2.5,1.023809524,60,False
102,202401,86,0,2.5,1.023809524,90,True
Expand Down
12 changes: 6 additions & 6 deletions tests/data/winsorisation/predicted_unit_value_output.csv
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
strata,period,aux,sampled,a_weight,g_weight,target_variable,nw_ag_flag,predicted_unit_value
group,period,aux,sampled,a_weight,g_weight,target_variable,nw_ag_flag,predicted_unit_value
101,202401,10,0,1.666666667,1.023809524,12,False,
101,202312,23,1,1.666666667,1.023809524,20,False,22.361111
101,202401,41,1,1.666666667,1.023809524,20,False,39.861111
101,202401,53,1,1.666666667,1.023809524,40,False,51.527778
101,202401,23,1,1.666666667,1.023809524,20,False,14.375
101,202401,41,1,1.666666667,1.023809524,20,False,25.625
101,202402,53,1,1.666666667,1.023809524,40,False,40
101,202401,12,0,1.666666667,1.023809524,10,False,
102,202401,50,1,2.5,1.023809524,60,False,48.611111
102,202401,40,1,2.5,1.023809524,50,False,38.888889
102,202401,50,1,2.5,1.023809524,60,False,60
102,202402,40,1,2.5,1.023809524,50,False,50
102,202401,45,0,2.5,1.023809524,50,False,
102,202401,70,0,2.5,1.023809524,60,False,
102,202401,86,0,2.5,1.023809524,90,True,
Expand Down
12 changes: 6 additions & 6 deletions tests/data/winsorisation/ratio_estimation_data.csv
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
strata,period,aux,sampled,a_weight,g_weight,target_variable,nw_ag_flag,predicted_unit_value,l_value
group,period,aux,sampled,a_weight,g_weight,target_variable,nw_ag_flag,predicted_unit_value,l_value
101,202401,10,0,1.666666667,1.023809524,12,False,,
101,202312,23,1,1.666666667,1.023809524,20,False,22.361111,0.5
101,202401,41,1,1.666666667,1.023809524,20,False,39.861111,0.5
101,202401,53,1,1.666666667,1.023809524,40,False,51.527778,0.5
101,202401,23,1,1.666666667,1.023809524,20,False,14.375,0.5
101,202401,41,1,1.666666667,1.023809524,20,False,25.625,0.5
101,202402,53,1,1.666666667,1.023809524,40,False,40,0.5
101,202401,12,0,1.666666667,1.023809524,10,False,,
102,202401,50,1,2.5,1.023809524,60,False,48.611111,0.5
102,202401,40,1,2.5,1.023809524,50,False,38.888889,0.5
102,202401,50,1,2.5,1.023809524,60,False,60,0.5
102,202402,40,1,2.5,1.023809524,50,False,50,0.5
102,202401,45,0,2.5,1.023809524,50,True,,
102,202401,70,0,2.5,1.023809524,60,True,,
102,202401,86,0,2.5,1.023809524,90,False,,
Expand Down
12 changes: 6 additions & 6 deletions tests/data/winsorisation/ratio_estimation_data_output.csv
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
strata,period,aux,sampled,a_weight,g_weight,target_variable,nw_ag_flag,predicted_unit_value,l_value,ratio_estimation_treshold
group,period,aux,sampled,a_weight,g_weight,target_variable,nw_ag_flag,predicted_unit_value,l_value,ratio_estimation_treshold
101,202401,10,0,1.666666667,1.023809524,12,False,,,
101,202312,23,1,1.666666667,1.023809524,20,False,22.361111,0.5,23.068976
101,202401,41,1,1.666666667,1.023809524,20,False,39.861111,0.5,40.568976
101,202401,53,1,1.666666667,1.023809524,40,False,51.527778,0.5,52.235643
101,202401,23,1,1.666666667,1.023809524,20,False,14.375,0.5,15.0828652
101,202401,41,1,1.666666667,1.023809524,20,False,25.625,0.5,26.3328652
101,202402,53,1,1.666666667,1.023809524,40,False,40,0.5,40.7078652
101,202401,12,0,1.666666667,1.023809524,10,False,,,
102,202401,50,1,2.5,1.023809524,60,False,48.611111,0.5,48.931722
102,202401,40,1,2.5,1.023809524,50,False,38.888889,0.5,39.209500
102,202401,50,1,2.5,1.023809524,60,False,60,0.5,60.3206107
102,202402,40,1,2.5,1.023809524,50,False,50,0.5,50.3206107
102,202401,45,0,2.5,1.023809524,50,True,,,
102,202401,70,0,2.5,1.023809524,60,True,,,
102,202401,86,0,2.5,1.023809524,90,False,,,
Expand Down
12 changes: 6 additions & 6 deletions tests/data/winsorisation/winsorised_weight_data.csv
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
strata,period,aux,sampled,a_weight,g_weight,target_variable,nw_ag_flag,predicted_unit_value,l_value,ratio_estimation_treshold
group,period,aux,sampled,a_weight,g_weight,target_variable,nw_ag_flag,predicted_unit_value,l_value,ratio_estimation_treshold
101,202401,10,0,1.666666667,1.023809524,12,False,,,
101,202312,23,1,1.666666667,1.023809524,20,False,22.361111,0.5,23.068976
101,202401,41,1,1.666666667,1.023809524,20,False,39.861111,0.5,40.568976
101,202401,53,1,1.666666667,1.023809524,40,False,51.527778,0.5,52.235643
101,202401,23,1,1.666666667,1.023809524,20,False,14.375,0.5,15.0828652
101,202401,41,1,1.666666667,1.023809524,20,False,25.625,0.5,26.3328652
101,202402,53,1,1.666666667,1.023809524,40,False,40,0.5,40.7078652
101,202401,12,0,1.666666667,1.023809524,10,False,,,
102,202401,50,1,2.5,1.023809524,60,False,48.611111,0.5,48.931722
102,202401,40,1,2.5,1.023809524,50,False,38.888889,0.5,39.209500
102,202401,50,1,2.5,1.023809524,60,False,60,0.5,60.3206107
102,202402,40,1,2.5,1.023809524,50,False,50,0.5,50.3206107
102,202401,45,0,2.5,1.023809524,50,True,,,
102,202401,70,0,2.5,1.023809524,60,True,,,
102,202401,86,0,2.5,1.023809524,90,False,,,
Expand Down
12 changes: 6 additions & 6 deletions tests/data/winsorisation/winsorised_weight_data_output.csv
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
strata,period,aux,sampled,a_weight,g_weight,target_variable,nw_ag_flag,predicted_unit_value,l_value,ratio_estimation_treshold,new_target_variable,outlier_weight
group,period,aux,sampled,a_weight,g_weight,target_variable,nw_ag_flag,predicted_unit_value,l_value,ratio_estimation_treshold,new_target_variable,outlier_weight
101,202401,10,0,1.666666667,1.023809524,12,False,,,,,
101,202312,23,1,1.666666667,1.023809524,20,False,22.361111,0.5,23.068976,20.0000, 1.00000
101,202401,41,1,1.666666667,1.023809524,20,False,39.861111,0.5,40.568976,20.00000, 1.00000
101,202401,53,1,1.666666667,1.023809524,40,False,51.527778,0.5,52.235643,40.00000,1.00000
101,202401,23,1,1.666666667,1.023809524,20,False,14.375,0.5,15.0828652,17.96453488,0.898227
101,202401,41,1,1.666666667,1.023809524,20,False,25.625,0.5,26.3328652,20,1
101,202402,53,1,1.666666667,1.023809524,40,False,40,0.5,40.7078652,40,1
101,202401,12,0,1.666666667,1.023809524,10,False,,,,,
102,202401,50,1,2.5,1.023809524,60,False,48.611111,0.5,48.931722,53.256072,0.887601
102,202401,40,1,2.5,1.023809524,50,False,38.888889,0.5,39.209500,43.425323,0.868506
102,202401,50,1,2.5,1.023809524,60,False,60,0.5,60.3206107,60,1
102,202402,40,1,2.5,1.023809524,50,False,50,0.5,50.3206107,50,1
102,202401,45,0,2.5,1.023809524,50,True,,,,,
102,202401,70,0,2.5,1.023809524,60,True,,,,,
102,202401,86,0,2.5,1.023809524,90,False,,,,,
Expand Down
6 changes: 6 additions & 0 deletions tests/test_calculate_predicted_unit_value.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ def test_calculate_predicted_unit_value(
):
expected_output = predicted_unit_value_test_output[
[
"group",
"period",
"aux",
"sampled",
"a_weight",
Expand All @@ -41,6 +43,8 @@ def test_calculate_predicted_unit_value(
]
input_data = predicted_unit_value_test_data[
[
"group",
"period",
"aux",
"sampled",
"a_weight",
Expand All @@ -51,6 +55,8 @@ def test_calculate_predicted_unit_value(

actual_output = calculate_predicted_unit_value(
input_data,
"group",
"period",
"aux",
"sampled",
"a_weight",
Expand Down
6 changes: 3 additions & 3 deletions tests/test_calculate_winsorised_weight.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def test_winsorised_weight(
):
expected_output = winsorised_weight_test_output[
[
"strata",
"group",
"period",
"aux",
"sampled",
Expand All @@ -48,7 +48,7 @@ def test_winsorised_weight(
]
input_data = winsorised_weight_test_data[
[
"strata",
"group",
"period",
"aux",
"sampled",
Expand All @@ -64,7 +64,7 @@ def test_winsorised_weight(

actual_output = calculate_winsorised_weight(
input_data,
"strata",
"group",
"period",
"aux",
"sampled",
Expand Down

0 comments on commit 26b4a35

Please sign in to comment.