Skip to content

Commit

Permalink
Define strata and period as separate inputs
Browse files Browse the repository at this point in the history
  • Loading branch information
AntonZogk committed May 21, 2024
1 parent 4bc39c4 commit 761c283
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 49 deletions.
63 changes: 16 additions & 47 deletions src/forward_link.py
Original file line number Diff line number Diff line change
@@ -1,65 +1,30 @@
from typing import List

import numpy as np
import pandas as pd


def mask_values(df: pd.DataFrame, target_variable: str, expr: str) -> pd.Series:
    """Blank out values of a column where a pandas expression is not True.

    Rows for which ``expr`` evaluates to False are set to NaN (not 0); all
    other rows keep their original value. ``df`` itself is never modified.

    Parameters
    ----------
    df : pd.DataFrame
        Pandas dataframe of original data.
    target_variable : str
        Column name containing the target variable.
    expr : str
        Boolean expression evaluated with ``DataFrame.eval``, see here:
        https://pandas.pydata.org/pandas-docs/version/1.5/reference/api/pandas.eval.html

    Returns
    -------
    pd.Series
        Copy of ``df[target_variable]`` with NaN wherever ``expr`` is False.
        If evaluating or applying ``expr`` raises ``ValueError``, a message
        is printed and an unmasked copy of the column is returned instead.
        Any other exception propagates to the caller.
    """
    column = df[target_variable]

    try:
        # `where` builds a new Series (keep where True, NaN elsewhere), so no
        # in-place assignment of NaN into a possibly integer-typed column is
        # needed and the input dataframe is left untouched.
        return column.where(df.eval(expr))

    except ValueError:
        # Best effort: report the problem and fall through to the unmasked data.
        print(
            f"""{expr} is not a valid expression,
            the code uses df.eval({expr}) to mask the dataframe, please see here:
            https://pandas.pydata.org/pandas-docs/version/1.5/reference/api/pandas.eval.html
            """
        )

    return column.copy()


def calculate_imputation_link(
df: pd.DataFrame,
groups: List[str] or str,
period: str,
strata: str,
match_col: str,
target_variable: str,
predictive_variable: str,
) -> pd.Series:
"""
Calculate link between target_variable and predictive_variable by given groups,
a match_col must be supplied which indicates if target_variable and
predictive_variable can be linked.
Calculate link between target_variable and predictive_variable by strata,
a match_col must be supplied which indicates if target_variable
and predictive_variable can be linked.
Parameters
----------
df : pd.Dataframe
Original dataframe.
groups : List[str] or str
Column name(s) to calculate the sums.
period : str
Column name containing time period.
strata : str
Column name containing strata information (sic).
match_col : str
Column of the matched pair links, this column should be bool,
or 0 and 1.
Column name of the matched pair links, this column should be bool.
target_variable : str
Column name of the targeted variable.
predictive_variable : str
Expand All @@ -81,9 +46,13 @@ def calculate_imputation_link(
df_intermediate[predictive_variable] * df_intermediate[match_col]
)

numerator = df_intermediate.groupby(groups)[target_variable].transform("sum")
numerator = df_intermediate.groupby([strata, period])[target_variable].transform(
"sum"
)

denominator = df_intermediate.groupby(groups)[predictive_variable].transform("sum")
denominator = df_intermediate.groupby([strata, period])[
predictive_variable
].transform("sum")

denominator.replace(0, np.nan, inplace=True) # cover division with 0

Expand Down
6 changes: 4 additions & 2 deletions tests/test_forward_link.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@ def test_forward_links(self, scenario):

link_to_test = calculate_imputation_link(
df_input,
["group", "period"],
"period",
"group",
"f_matched_pair",
"question",
"f_predictive_question",
Expand All @@ -35,7 +36,8 @@ def test_back_links(self, scenario):

link_to_test = calculate_imputation_link(
df_input,
["group", "period"],
"period",
"group",
"b_matched_pair",
"question",
"b_predictive_question",
Expand Down

0 comments on commit 761c283

Please sign in to comment.