Merge branch 'main' into 395-add-kwargs-low-level-fun

ONSdigital · Jun 20, 2024 · 499400a · 499400a
2 parents ab710d0 + c5d1f5b
commit 499400a
Show file tree

Hide file tree

Showing 15 changed files with 134 additions and 304 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -73,6 +73,7 @@ repos:
         types: [python]
         stages: [commit]
         args: ["--verbose"]
+        exclude: ^playground/
 
 #works
 -   repo: local

diff --git a/main.py b/main.py
@@ -0,0 +1,13 @@
+import pandas as pd
+
+from src.utils.hdfs_mods import hdfs_load_json as read_json
+
+# TODO: read from config
+folder_path = "/dapsen/workspace_zone/mbs-results/"
+file_name = "snapshot-202212-002-2156d36b-e61f-42f1-a0f1-61d1f8568b8e.json"
+file_path = folder_path + file_name
+
+snapshot = read_json(file_path)
+
+contributors = pd.DataFrame(snapshot["contributors"])
+responses = pd.DataFrame(snapshot["responses"])
diff --git a/requirements.txt b/requirements.txt
@@ -9,6 +9,7 @@ black
 isort
 nbstripout
 nbqa
+#research_and_development==1.0.0
 pre_commit_hooks
 flake8
 pandas==1.1.5

diff --git a/src/flag_and_count_matched_pairs.py b/src/flag_and_count_matched_pairs.py
@@ -1,79 +1,10 @@
-import numpy as np
-import pandas as pd
-
-
-def flag_matched_pair_merge(
-    df,
-    forward_or_backward,
-    target,
-    period,
-    reference,
-    strata,
-    time_difference=1,
-    **kwargs
-):
-    """
-    function to add flag to df if data forms a matched pair
-    i.e. data is given for both period and predictive period
-    Parameters
-    ----------
-    df : pd.DataFrame
-        pandas dataframe of original data
-    forward_or_backward: str
-        either f or b for forward or backward method
-    target : str
-        column name containing target variable
-    period : str
-        column name containing time period
-    reference : str
-        column name containing business reference id
-    strata : str
-        column name containing strata information (sic)
-    time_difference : int
-        time difference between predictive and target period in months
-
-
-    Returns
-    -------
-    pd.DataFrame
-        dataframe with column added flagging forward matched paris and
-        predictive target variable data column
-    """
-
-    if forward_or_backward == "f":
-        time_difference = time_difference
-    elif forward_or_backward == "b":
-        time_difference = -time_difference
-
-    # Creating new DF, shifting period for forward or backward
-    df_with_predictive_column = df.copy()[[reference, strata, target]]
-    df_with_predictive_column["predictive_period"] = df[period] + pd.DateOffset(
-        months=time_difference
-    )
-    predictive_col_name = forward_or_backward + "_predictive_" + target
-    df_with_predictive_column.rename(
-        columns={target: predictive_col_name}, inplace=True
-    )
+import numpy as np  # noqa F401
+import pandas as pd  # noqa F401
 
-    df = df.merge(
-        df_with_predictive_column,
-        left_on=[reference, period, strata],
-        right_on=[reference, "predictive_period", strata],
-        how="left",
-    )
 
-    matched_col_name = forward_or_backward + "_matched_pair_" + target
 
-    df[matched_col_name] = np.where(
-        df[[target, predictive_col_name]].isnull().any(axis=1), False, True
-    )
-
-    df.drop(["predictive_period"], axis=1, inplace=True)
-    return df
-
-
-def flag_matched_pair_shift(
-    df, forward_or_backward, target, period, reference, strata, shift=1
+def flag_matched_pair(
+    df, forward_or_backward, target, period, reference, strata, time_difference=1
 ):
     """
     function to flag matched pairs using the shift method
@@ -82,7 +13,7 @@ def flag_matched_pair_shift(
     ----------
     df : pd.DataFrame
         pandas dataframe of original data
-    shift : int
+    forward_or_backward : str
         number of rows to shift up or down
     target : str
         column name containing target variable
@@ -92,69 +23,59 @@ def flag_matched_pair_shift(
         column name containing business reference id
     strata : str
         column name containing strata information (sic)
+    time_difference : int
+        rows to shift and months between paired periods (negated for "b")
 
     Returns
     -------
     _type_
-        pandas dataframe with column added flagging forward matched pairs and
+        pandas dataframe: the main dataframe with column added flagging
+        forward matched pairs and
         predictive target variable data column
     """
 
-    if forward_or_backward == "f":
-        shift = shift
-    elif forward_or_backward == "b":
-        shift = -shift
-
     df = df.sort_values(by=[reference, period])
-    predictive_col_name = forward_or_backward + "_predictive_" + target
-    df[[predictive_col_name, "predictive_period"]] = df.groupby(
-        [reference, strata]
-    ).shift(shift)[[target, period]]
 
-    df["validate_date"] = np.where(
-        df[period].dt.month - df["predictive_period"].dt.month == shift, True, False
-    )
-    matched_col_name = forward_or_backward + "_matched_pair_" + target
+    if forward_or_backward == "b":
+        time_difference = -time_difference
 
-    df[matched_col_name] = np.where(
-        df[[target, predictive_col_name]].isnull().any(axis=1) | (~df["validate_date"]),
-        False,
-        True,
+    df[forward_or_backward + "_match"] = (
+        df.groupby([strata, reference])
+        .shift(time_difference)[target]
+        .notnull()
+        .mul(df[target].notnull())
+        .mul(
+            df[period] - pd.DateOffset(months=time_difference)
+            == df.shift(time_difference)[period]
+        )
     )
 
-    df.drop(["validate_date", "predictive_period"], axis=1, inplace=True)
+    df.reset_index(drop=True, inplace=True)
 
     return df
 
 
-def count_matches(
-    df, flag_column_name, period, strata, count_column_name=None, **kwargs
-):
+def count_matches(df, flag, period, strata):
     """
-    Function to count the number of records with matches per period and stratum
+    function to count flagged matches per strata and period
+
 
     Parameters
     ----------
     df : pd.DataFrame
-        pandas dataframe of original data
-    flag_column_name : str
-        name of column containing flags if a match exists
+        pandas dataframe of original data with imputation flags
+    flag : str/list
+        the imputation flag column/s. Single string if one column, list of
+        strings for multiple columns.
     period : str
         column name containing time period
     strata : str
         column name containing strata information (sic)
-    count_col_name : str, None
-        name to give to count column. If `None`, name will be derived based on
-        flag column name
 
     Returns
     -------
-    pd.DataFrame
-        dataframe with column added for count of records with matches
+    pd.DataFrame
+        match counts for each flag column, one row per strata and period.
     """
-    if count_column_name is None:
-        count_column_name = flag_column_name.split("_")[0] + "_matched_pair_count"
-    df[count_column_name] = df.groupby([strata, period])[flag_column_name].transform(
-        "sum"
-    )
-    return df
+
+    return df.groupby([strata, period])[flag].agg("sum").reset_index()
diff --git a/tests/construction_matches.csv b/tests/construction_matches.csv
@@ -1,9 +1,9 @@
-period,strata,auxiliary,target,flag_construction_matches,count_construction_matches
-202401,12340,120,,False,1
-202401,12340,120,10,True,1
-202402,12340,120,11,True,2
-202402,12340,240,12,True,2
-202403,12340,,13,False,1
-202403,12340,240,14,True,1
-202404,12340,120,15,True,1
-202404,56789,240,16,True,1
+period,strata,auxiliary,target,flag_construction_matches
+202401,12340,120,,False
+202401,12340,120,10,True
+202402,12340,120,11,True
+202402,12340,240,12,True
+202403,12340,,13,False
+202403,12340,240,14,True
+202404,12340,120,15,True
+202404,56789,240,16,True
diff --git a/tests/test_construction_matches.py b/tests/test_construction_matches.py
@@ -5,7 +5,6 @@
 from pandas.testing import assert_frame_equal
 
 from src.construction_matches import flag_construction_matches
-from src.flag_and_count_matched_pairs import count_matches
 
 
 @pytest.fixture(scope="class")
@@ -30,24 +29,3 @@ def test_construction_matches_flag(self, construction_test_data):
         )
 
         assert_frame_equal(actual_output, expected_output)
-
-    def test_construction_matches_count(self, construction_test_data):
-        expected_output = construction_test_data[
-            [
-                "period",
-                "flag_construction_matches",
-                "strata",
-                "count_construction_matches",
-            ]
-        ]
-
-        input_data = expected_output.drop(columns=["count_construction_matches"])
-        actual_output = count_matches(
-            input_data,
-            "flag_construction_matches",
-            "period",
-            "strata",
-            "count_construction_matches",
-        )
-
-        assert_frame_equal(actual_output, expected_output)
diff --git a/tests/test_data_matched_pair/case1_expected_output.csv b/tests/test_data_matched_pair/case1_expected_output.csv
diff --git a/tests/test_data_matched_pair/case2_expected_output.csv b/tests/test_data_matched_pair/case2_expected_output.csv
diff --git a/tests/test_data_matched_pair/case3_expected_output.csv b/tests/test_data_matched_pair/case3_expected_output.csv
diff --git a/tests/test_data_matched_pair/count_matches_expected_output.csv b/tests/test_data_matched_pair/count_matches_expected_output.csv
@@ -0,0 +1,5 @@
+group,period,flag_1,flag_2
+1,202401,1,0
+1,202402,0,2
+2,202401,2,1
+2,202402,1,1
diff --git a/tests/test_data_matched_pair/count_matches_input.csv b/tests/test_data_matched_pair/count_matches_input.csv
@@ -0,0 +1,9 @@
+period,group,flag_1,flag_2
+202401,1,1,FALSE
+202401,1,0,FALSE
+202402,1,0,TRUE
+202402,1,0,TRUE
+202401,2,1,TRUE
+202401,2,1,FALSE
+202402,2,0,FALSE
+202402,2,1,TRUE
diff --git a/tests/test_data_matched_pair/flag_pairs_2_groups_expected_output.csv b/tests/test_data_matched_pair/flag_pairs_2_groups_expected_output.csv
@@ -0,0 +1,9 @@
+reference,strata,period,target_variable,f_match,b_match
+1,101,202401,237,False,True
+1,101,202402,281,True,False
+1,101,202403,,False,False
+2,101,202401,270,False,True
+2,101,202402,250,True,True
+2,101,202403,255,True,False
+2,102,202404,260,False,True
+2,102,202405,272,True,False
diff --git a/tests/test_data_matched_pair/flag_pairs_expected_output.csv b/tests/test_data_matched_pair/flag_pairs_expected_output.csv
@@ -0,0 +1,9 @@
+reference,strata,period,target_variable,f_match,b_match
+1,101,202401,237,False,True
+1,101,202402,281,True,False
+1,101,202403,,False,False
+2,101,202401,270,False,True
+2,101,202402,250,True,True
+2,101,202403,255,True,False
+2,102,202404,260,False,True
+2,102,202405,272,True,False
diff --git a/tests/test_data_matched_pair/flag_pairs_missing_rows_expected_output.csv b/tests/test_data_matched_pair/flag_pairs_missing_rows_expected_output.csv
@@ -0,0 +1,5 @@
+reference,strata,period,target_variable,f_match,b_match
+1,101,202401,237,FALSE,TRUE
+1,101,202402,100,TRUE,FALSE
+1,101,202404,34,FALSE,TRUE
+1,101,202405,19,TRUE,FALSE