Skip to content

Commit

Permalink
Feat shift_by_strata_period function
Browse files Browse the repository at this point in the history
* Add a function which shifts data within each stratum and only across
  consecutive dates.
* Add test data with multiple values for strata.
* Add unit tests for forward and backward shifting.
  • Loading branch information
AntonZogk committed Jun 19, 2024
1 parent c5d1f5b commit 4ce48c8
Show file tree
Hide file tree
Showing 3 changed files with 108 additions and 0 deletions.
58 changes: 58 additions & 0 deletions src/predictive_variable.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
import pandas as pd

def shift_by_strata_period(
    df: pd.DataFrame,
    target: str,
    period: str,
    strata: str,
    reference: str,
    time_difference: int,
    new_col: str,
    **kwargs
) -> pd.DataFrame:
    """
    Shift ``target`` within runs of consecutive monthly periods.

    Rows are sorted by ``reference``, ``strata`` and ``period``. A run is a
    sequence of rows sharing the same reference and strata whose periods
    are each exactly one month after the previous row. The shift is applied
    inside each run only, so values never cross a gap in the periods or a
    change of reference or strata.

    Parameters
    ----------
    df : pd.DataFrame
        Pandas dataframe of original data. It is sorted in place and the
        new column is added to it.
    target : str
        Column name containing target variable to be shifted.
    period : str
        Column name containing time period. The values must support
        subtraction of ``pd.DateOffset(months=1)``.
    strata : str
        Column name containing strata information (sic).
    reference : str
        Column name containing business reference id.
    time_difference : int
        Number of periods to shift. Can be positive or negative.
    new_col : str
        Column name containing the shifted values.
    kwargs : mapping, optional
        Accepted for call compatibility; currently not used.

    Returns
    -------
    df : pd.DataFrame
        The same dataframe object, sorted by reference, strata and period
        (original index labels are kept), with a new column containing the
        shifted values.
    """
    # Make each reference/strata combination contiguous and chronological.
    df.sort_values([reference, strata, period], inplace=True)

    # A row opens a new run when the previous row is not exactly one month
    # earlier, or when the strata or the reference differs from the previous
    # row. Comparing with the previous row (instead of subtracting) also
    # works for non-numeric identifiers such as strings.
    breaks_period = df[period] - pd.DateOffset(months=1) != df[period].shift(1)
    breaks_strata = df[strata] != df[strata].shift(1)
    breaks_reference = df[reference] != df[reference].shift(1)

    # The running count of run starts gives every run its own label.
    run_label = (breaks_period | breaks_strata | breaks_reference).cumsum()

    # Shift only the target column, independently inside each run.
    df[new_col] = df.groupby(run_label)[target].shift(time_difference)

    return df
16 changes: 16 additions & 0 deletions tests/predictive_variable.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
identifier,period,group,question,other,f_predictive,b_predictive
10001,202001,1,547,10,,362
10001,202002,1,362,10,547,895
10001,202003,1,895,10,362,
10002,202001,1,381,50,,573
10002,202002,1,573,50,381,214
10002,202003,1,214,50,573,
10001,202001,2,961,12,,267
10001,202002,2,267,12,961,314
10001,202003,2,314,12,267,
10002,202001,2,555,9,,628
10002,202002,2,628,9,555,736
10002,202003,2,736,9,628,
10005,202001,1,,18,,
10005,202002,2,,18,,100
10005,202003,2,100,18,,
34 changes: 34 additions & 0 deletions tests/test_predictive_variable.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
from src.predictive_variable import shift_by_strata_period
from pathlib import Path
from helper_functions import load_and_format
from pandas.testing import assert_frame_equal

import pytest

@pytest.fixture(scope="class")
def predictive_variable_test_data():
    """Load the predictive-variable test CSV once per test class.

    The CSV is located next to this test module, so the path is built from
    ``__file__`` and does not depend on the directory pytest is started from.
    """
    return load_and_format(Path(__file__).parent / "predictive_variable.csv")

class TestPredictiveVariable:
    """Compare shift_by_strata_period output with the expected CSV columns."""

    @staticmethod
    def _check_shift(test_data, time_difference, shifted_column):
        """Run the shift and compare it with ``shifted_column`` in test_data."""
        columns = ["identifier", "period", "group", "question", "other"]
        expected = test_data[columns + [shifted_column]]
        # The function input is the expected frame minus the column under test.
        function_input = expected.drop(columns=shifted_column)
        result = shift_by_strata_period(
            function_input,
            "question",
            "period",
            "group",
            "identifier",
            time_difference,
            shifted_column,
        )
        # The function returns sorted rows; restore the original row order
        # before comparing with the expected frame.
        result = result.sort_index(ascending=True)
        assert_frame_equal(result, expected)

    def test_predictive_variable_forward(self, predictive_variable_test_data):
        self._check_shift(predictive_variable_test_data, 1, "f_predictive")

    def test_predictive_variable_backward(self, predictive_variable_test_data):
        self._check_shift(predictive_variable_test_data, -1, "b_predictive")

0 comments on commit 4ce48c8

Please sign in to comment.