From 761c28331d921fec6bcf7a00681b43c5b6e9d0d1 Mon Sep 17 00:00:00 2001 From: zogkoa Date: Tue, 21 May 2024 11:15:14 +0100 Subject: [PATCH] Define strata and period as separate inputs --- src/forward_link.py | 63 ++++++++++---------------------------- tests/test_forward_link.py | 6 ++-- 2 files changed, 20 insertions(+), 49 deletions(-) diff --git a/src/forward_link.py b/src/forward_link.py index 14870292..f58e5512 100644 --- a/src/forward_link.py +++ b/src/forward_link.py @@ -1,65 +1,30 @@ -from typing import List - import numpy as np import pandas as pd -def mask_values(df: pd.DataFrame, target_variable: str, expr: str) -> pd.Series: - """Convert values in a dataframe column to 0 based on a python expression - - Parameters - ---------- - df : pd.Dataframe - Pandas dataframe of original data. - target_variable : List[str] or str - Column name(s) containing target variable(s). - query : str - The expression to evaluate, see here: - https://pandas.pydata.org/pandas-docs/version/1.5/reference/api/pandas.eval.html - - Returns - ------- - df : pd.Series - - - """ - masked_column = df[target_variable].copy() - - try: - masked_column.loc[~(df.eval(expr))] = np.nan - - except ValueError: - print( - f"""{expr} is not a valid expression, - the code uses ~(df.eval({expr}) to mask the dataframe, please see here: - https://pandas.pydata.org/pandas-docs/version/1.5/reference/api/pandas.eval.html - """ - ) - - return masked_column - - def calculate_imputation_link( df: pd.DataFrame, - groups: List[str] or str, + period: str, + strata: str, match_col: str, target_variable: str, predictive_variable: str, ) -> pd.Series: """ - Calculate link between target_variable and predictive_variable by given groups, - a match_col must be supplied which indicates if target_variable and - predictive_variable can be linked. 
+ Calculate link between target_variable and predictive_variable by strata, + a match_col must be supplied which indicates if target_variable + and predictive_variable can be linked. Parameters ---------- df : pd.Dataframe Original dataframe. - groups : List[str] or str - Column name(s) to calculate the sums. + period : str + Column name containing time period. + strata : str + Column name containing strata information (sic). match_col : str - Column of the matched pair links, this column should be bool, - or 0 and 1. + Column name of the matched pair links, this column should be bool. target_variable : str Column name of the targeted variable. predictive_variable : str @@ -81,9 +46,13 @@ def calculate_imputation_link( df_intermediate[predictive_variable] * df_intermediate[match_col] ) - numerator = df_intermediate.groupby(groups)[target_variable].transform("sum") + numerator = df_intermediate.groupby([strata, period])[target_variable].transform( + "sum" + ) - denominator = df_intermediate.groupby(groups)[predictive_variable].transform("sum") + denominator = df_intermediate.groupby([strata, period])[ + predictive_variable + ].transform("sum") denominator.replace(0, np.nan, inplace=True) # cover division with 0 diff --git a/tests/test_forward_link.py b/tests/test_forward_link.py index 74e32005..8012d001 100644 --- a/tests/test_forward_link.py +++ b/tests/test_forward_link.py @@ -18,7 +18,8 @@ def test_forward_links(self, scenario): link_to_test = calculate_imputation_link( df_input, - ["group", "period"], + "period", + "group", "f_matched_pair", "question", "f_predictive_question", @@ -35,7 +36,8 @@ def test_back_links(self, scenario): link_to_test = calculate_imputation_link( df_input, - ["group", "period"], + "period", + "group", "b_matched_pair", "question", "b_predictive_question",