From 4ce48c8d719dce4821033f37d974745c0eab9119 Mon Sep 17 00:00:00 2001
From: zogkoa <Anton.Zogkolli@ons.gov.uk>
Date: Wed, 19 Jun 2024 17:15:18 +0100
Subject: [PATCH] Feat shift_by_strata_period function

* Add a function that shifts a target column within each reference and
  strata group, restarting wherever periods are not consecutive months.
* Add test data covering multiple strata values.
* Add unit tests for forward and backward shifting.
---
 src/predictive_variable.py        | 58 +++++++++++++++++++++++++++++++
 tests/predictive_variable.csv     | 16 +++++++++
 tests/test_predictive_variable.py | 34 ++++++++++++++++++
 3 files changed, 108 insertions(+)
 create mode 100644 src/predictive_variable.py
 create mode 100644 tests/predictive_variable.csv
 create mode 100644 tests/test_predictive_variable.py

diff --git a/src/predictive_variable.py b/src/predictive_variable.py
new file mode 100644
index 00000000..cc17aee7
--- /dev/null
+++ b/src/predictive_variable.py
@@ -0,0 +1,58 @@
+import pandas as pd
+
+def shift_by_strata_period(
+    df: pd.DataFrame,
+    target: str,
+    period: str,
+    strata: str,
+    reference: str,
+    time_difference: int,
+    new_col: str,
+    **kwargs,
+) -> pd.DataFrame:
+    """
+    Shift ``target`` by ``time_difference`` rows within runs of consecutive
+    monthly periods that share the same reference and strata.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        Pandas dataframe of original data; it is left unmodified.
+    target : str
+        Column name containing target variable to be shifted.
+    period : str
+        Column name containing time period (datetime-like values).
+    strata : str
+        Column name containing strata information.
+    reference : str
+        Column name containing business reference id.
+    time_difference : int
+        Number of periods to shift. Can be positive or negative.
+    new_col : str
+        Column name containing the shifted values.
+    kwargs : mapping, optional
+        Accepted but not used by this function.
+
+    Returns
+    -------
+    pd.DataFrame
+        Copy of ``df`` sorted by reference, strata and period (original
+        index labels kept) with ``new_col`` holding the shifted values.
+    """
+    # sort_values returns a new frame, so the caller's data is not mutated.
+    df = df.sort_values([reference, strata, period])
+
+    # A run restarts wherever the previous row is not exactly one month
+    # earlier or has a different strata or reference. Comparing against
+    # shift() instead of diff() also supports non-numeric identifiers.
+    starts_new_run = (
+        (df[period] - pd.DateOffset(months=1) != df[period].shift(1))
+        | (df[strata] != df[strata].shift(1))
+        | (df[reference] != df[reference].shift(1))
+    )
+
+    # Shifting inside each run keeps values from leaking across gaps or
+    # group boundaries; rows with no in-run neighbour become NaN.
+    run_id = starts_new_run.cumsum()
+    df[new_col] = df[target].groupby(run_id).shift(time_difference)
+    return df
\ No newline at end of file
diff --git a/tests/predictive_variable.csv b/tests/predictive_variable.csv
new file mode 100644
index 00000000..72c27a39
--- /dev/null
+++ b/tests/predictive_variable.csv
@@ -0,0 +1,16 @@
+identifier,period,group,question,other,f_predictive,b_predictive
+10001,202001,1,547,10,,362
+10001,202002,1,362,10,547,895
+10001,202003,1,895,10,362,
+10002,202001,1,381,50,,573
+10002,202002,1,573,50,381,214
+10002,202003,1,214,50,573,
+10001,202001,2,961,12,,267
+10001,202002,2,267,12,961,314
+10001,202003,2,314,12,267,
+10002,202001,2,555,9,,628
+10002,202002,2,628,9,555,736
+10002,202003,2,736,9,628,
+10005,202001,1,,18,,
+10005,202002,2,,18,,100
+10005,202003,2,100,18,,
diff --git a/tests/test_predictive_variable.py b/tests/test_predictive_variable.py
new file mode 100644
index 00000000..0b3672c0
--- /dev/null
+++ b/tests/test_predictive_variable.py
@@ -0,0 +1,34 @@
+from src.predictive_variable import shift_by_strata_period
+from pathlib import Path
+from helper_functions import load_and_format
+from pandas.testing import assert_frame_equal
+
+import pytest
+
+@pytest.fixture(scope="class")
+def predictive_variable_test_data():  # CSV lives beside this test module
+    return load_and_format(Path(__file__).parent / "predictive_variable.csv")
+
+class TestPredictiveVariable:
+    """Forward and backward shifts are checked against the CSV columns."""
+
+    shared_columns = ["identifier", "period", "group", "question", "other"]
+
+    def run_shift(self, data, step, shifted_col):
+        """Return (actual, expected) frames for a shift of ``step`` periods."""
+        expected = data[self.shared_columns + [shifted_col]]
+        actual = shift_by_strata_period(
+            expected.drop(columns=shifted_col),
+            "question", "period", "group", "identifier", step, shifted_col,
+        )
+        # The function sorts its rows; restore index order before comparing.
+        actual.sort_index(ascending=True, inplace=True)
+        return actual, expected
+
+    def test_predictive_variable_forward(self, predictive_variable_test_data):
+        pair = self.run_shift(predictive_variable_test_data, 1, "f_predictive")
+        assert_frame_equal(*pair)
+
+    def test_predictive_variable_backward(self, predictive_variable_test_data):
+        pair = self.run_shift(predictive_variable_test_data, -1, "b_predictive")
+        assert_frame_equal(*pair)
\ No newline at end of file