Skip to content

Commit

Permalink
Merge branch 'main' into 395-add-kwargs-low-level-fun
Browse files Browse the repository at this point in the history
  • Loading branch information
AntonZogk authored Jun 20, 2024
2 parents ab710d0 + c5d1f5b commit 499400a
Show file tree
Hide file tree
Showing 15 changed files with 134 additions and 304 deletions.
1 change: 1 addition & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ repos:
types: [python]
stages: [commit]
args: ["--verbose"]
exclude: ^playground/

#works
- repo: local
Expand Down
13 changes: 13 additions & 0 deletions main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
import pandas as pd

from src.utils.hdfs_mods import hdfs_load_json as read_json

# TODO: read from config
folder_path = "/dapsen/workspace_zone/mbs-results/"
file_name = "snapshot-202212-002-2156d36b-e61f-42f1-a0f1-61d1f8568b8e.json"
file_path = f"{folder_path}{file_name}"

# Load the snapshot through the HDFS JSON helper, then build one dataframe
# for each of the two top-level sections read from it.
snapshot = read_json(file_path)
contributors, responses = (
    pd.DataFrame(snapshot[section]) for section in ("contributors", "responses")
)
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ black
isort
nbstripout
nbqa
#research_and_development==1.0.0
pre_commit_hooks
flake8
pandas==1.1.5
Expand Down
143 changes: 32 additions & 111 deletions src/flag_and_count_matched_pairs.py
Original file line number Diff line number Diff line change
@@ -1,79 +1,10 @@
import numpy as np
import pandas as pd


def flag_matched_pair_merge(
df,
forward_or_backward,
target,
period,
reference,
strata,
time_difference=1,
**kwargs
):
"""
function to add flag to df if data forms a matched pair
i.e. data is given for both period and predictive period
Parameters
----------
df : pd.DataFrame
pandas dataframe of original data
forward_or_backward: str
either f or b for forward or backward method
target : str
column name containing target variable
period : str
column name containing time period
reference : str
column name containing business reference id
strata : str
column name containing strata information (sic)
time_difference : int
time difference between predictive and target period in months
Returns
-------
pd.DataFrame
dataframe with column added flagging forward matched pairs and
predictive target variable data column
"""

if forward_or_backward == "f":
time_difference = time_difference
elif forward_or_backward == "b":
time_difference = -time_difference

# Creating new DF, shifting period for forward or backward
df_with_predictive_column = df.copy()[[reference, strata, target]]
df_with_predictive_column["predictive_period"] = df[period] + pd.DateOffset(
months=time_difference
)
predictive_col_name = forward_or_backward + "_predictive_" + target
df_with_predictive_column.rename(
columns={target: predictive_col_name}, inplace=True
)
import numpy as np # noqa F401
import pandas as pd # noqa F401

df = df.merge(
df_with_predictive_column,
left_on=[reference, period, strata],
right_on=[reference, "predictive_period", strata],
how="left",
)

matched_col_name = forward_or_backward + "_matched_pair_" + target

df[matched_col_name] = np.where(
df[[target, predictive_col_name]].isnull().any(axis=1), False, True
)

df.drop(["predictive_period"], axis=1, inplace=True)
return df


def flag_matched_pair_shift(
df, forward_or_backward, target, period, reference, strata, shift=1
def flag_matched_pair(
df, forward_or_backward, target, period, reference, strata, time_difference=1
):
"""
function to flag matched pairs using the shift method
Expand All @@ -82,7 +13,7 @@ def flag_matched_pair_shift(
----------
df : pd.DataFrame
pandas dataframe of original data
shift : int
forward_or_backward : str
either f or b for forward or backward method
target : str
column name containing target variable
Expand All @@ -92,69 +23,59 @@ def flag_matched_pair_shift(
column name containing business reference id
strata : str
column name containing strata information (sic)
time_difference: int
lookup distance for matched pairs
Returns
-------
pd.DataFrame
pandas dataframe with column added flagging forward matched pairs and
two pandas dataframes: the main dataframe with column added flagging
forward matched pairs and
predictive target variable data column
"""

if forward_or_backward == "f":
shift = shift
elif forward_or_backward == "b":
shift = -shift

df = df.sort_values(by=[reference, period])
predictive_col_name = forward_or_backward + "_predictive_" + target
df[[predictive_col_name, "predictive_period"]] = df.groupby(
[reference, strata]
).shift(shift)[[target, period]]

df["validate_date"] = np.where(
df[period].dt.month - df["predictive_period"].dt.month == shift, True, False
)
matched_col_name = forward_or_backward + "_matched_pair_" + target
if forward_or_backward == "b":
time_difference = -time_difference

df[matched_col_name] = np.where(
df[[target, predictive_col_name]].isnull().any(axis=1) | (~df["validate_date"]),
False,
True,
df[forward_or_backward + "_match"] = (
df.groupby([strata, reference])
.shift(time_difference)[target]
.notnull()
.mul(df[target].notnull())
.mul(
df[period] - pd.DateOffset(months=time_difference)
== df.shift(time_difference)[period]
)
)

df.drop(["validate_date", "predictive_period"], axis=1, inplace=True)
df.reset_index(drop=True, inplace=True)

return df


def count_matches(
df, flag_column_name, period, strata, count_column_name=None, **kwargs
):
def count_matches(df, flag, period, strata):
"""
Function to count the number of records with matches per period and stratum
function to count flagged matches per stratum and period
Parameters
----------
df : pd.DataFrame
pandas dataframe of original data
flag_column_name : str
name of column containing flags if a match exists
pandas dataframe of original data with imputation flags
flag : str/list
the imputation flag column/s. Single string if one column, list of
strings for multiple columns.
period : str
column name containing time period
strata : str
column name containing strata information (sic)
count_col_name : str, None
name to give to count column. If `None`, name will be derived based on
flag column name
Returns
-------
pd.DataFrame
dataframe with column added for count of records with matches
pd.DataFrame
pandas dataframe: match counts for each flag column.
"""
if count_column_name is None:
count_column_name = flag_column_name.split("_")[0] + "_matched_pair_count"
df[count_column_name] = df.groupby([strata, period])[flag_column_name].transform(
"sum"
)
return df

return df.groupby([strata, period])[flag].agg("sum").reset_index()
18 changes: 9 additions & 9 deletions tests/construction_matches.csv
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
period,strata,auxiliary,target,flag_construction_matches,count_construction_matches
202401,12340,120,,False,1
202401,12340,120,10,True,1
202402,12340,120,11,True,2
202402,12340,240,12,True,2
202403,12340,,13,False,1
202403,12340,240,14,True,1
202404,12340,120,15,True,1
202404,56789,240,16,True,1
period,strata,auxiliary,target,flag_construction_matches
202401,12340,120,,False
202401,12340,120,10,True
202402,12340,120,11,True
202402,12340,240,12,True
202403,12340,,13,False
202403,12340,240,14,True
202404,12340,120,15,True
202404,56789,240,16,True
22 changes: 0 additions & 22 deletions tests/test_construction_matches.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
from pandas.testing import assert_frame_equal

from src.construction_matches import flag_construction_matches
from src.flag_and_count_matched_pairs import count_matches


@pytest.fixture(scope="class")
Expand All @@ -30,24 +29,3 @@ def test_construction_matches_flag(self, construction_test_data):
)

assert_frame_equal(actual_output, expected_output)

def test_construction_matches_count(self, construction_test_data):
expected_output = construction_test_data[
[
"period",
"flag_construction_matches",
"strata",
"count_construction_matches",
]
]

input_data = expected_output.drop(columns=["count_construction_matches"])
actual_output = count_matches(
input_data,
"flag_construction_matches",
"period",
"strata",
"count_construction_matches",
)

assert_frame_equal(actual_output, expected_output)
7 changes: 0 additions & 7 deletions tests/test_data_matched_pair/case1_expected_output.csv

This file was deleted.

9 changes: 0 additions & 9 deletions tests/test_data_matched_pair/case2_expected_output.csv

This file was deleted.

7 changes: 0 additions & 7 deletions tests/test_data_matched_pair/case3_expected_output.csv

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
group,period,flag_1,flag_2
1,202401,1,0
1,202402,0,2
2,202401,2,1
2,202402,1,1
9 changes: 9 additions & 0 deletions tests/test_data_matched_pair/count_matches_input.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
period,group,flag_1,flag_2
202401,1,1,FALSE
202401,1,0,FALSE
202402,1,0,TRUE
202402,1,0,TRUE
202401,2,1,TRUE
202401,2,1,FALSE
202402,2,0,FALSE
202402,2,1,TRUE
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
reference,strata,period,target_variable,f_match,b_match
1,101,202401,237,False,True
1,101,202402,281,True,False
1,101,202403,,False,False
2,101,202401,270,False,True
2,101,202402,250,True,True
2,101,202403,255,True,False
2,102,202404,260,False,True
2,102,202405,272,True,False
9 changes: 9 additions & 0 deletions tests/test_data_matched_pair/flag_pairs_expected_output.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
reference,strata,period,target_variable,f_match,b_match
1,101,202401,237,False,True
1,101,202402,281,True,False
1,101,202403,,False,False
2,101,202401,270,False,True
2,101,202402,250,True,True
2,101,202403,255,True,False
2,102,202404,260,False,True
2,102,202405,272,True,False
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
reference,strata,period,target_variable,f_match,b_match
1,101,202401,237,FALSE,TRUE
1,101,202402,100,TRUE,FALSE
1,101,202404,34,FALSE,TRUE
1,101,202405,19,TRUE,FALSE
Loading

0 comments on commit 499400a

Please sign in to comment.