ONSdigital · robertswh · Jun 5, 2024 · May 23, 2024 · May 23, 2024 · May 23, 2024
@@ -0,0 +1,49 @@
+import pandas as pd
+
+# TODO: Extend function to receive multiple df with *df_with_filters
+
+
def flag_rows_to_ignore(
    df: pd.DataFrame, df_with_filters: pd.DataFrame
) -> pd.DataFrame:
    """
    Add a bool column named ignore_from_link to df, set to True for the
    observations defined in df_with_filters.

    A row of df is flagged when its values in the columns of df_with_filters
    are equal to the values of at least one row of df_with_filters. All the
    columns of df_with_filters are used together as the matching key.

    Parameters
    ----------
    df : pd.DataFrame
        Original dataframe. It is not modified.
    df_with_filters : pd.DataFrame
        Dataframe with observations which should be flagged in the original
        dataframe. Every column must also exist in df.

    Returns
    -------
    df : pd.DataFrame
        Copy of the original dataframe, with the same index and column
        order, plus a bool column ignore_from_link containing the flags.

    Raises
    ------
    ValueError
        If df_with_filters has no columns, or if it has a column which does
        not exist in df.

    """

    filter_columns = list(df_with_filters)

    if not filter_columns:

        raise ValueError(
            "df_with_filters has no columns, there is nothing to match on."
        )

    if not set(filter_columns).issubset(df.columns):

        raise ValueError(
            f"""df_with_filters has these columns {list(df_with_filters)} while
            df has these columns {list(df)}, please
            double check the column names."""
        )

    # TODO: Check if values to be ignored exist

    # Build the matching keys as indexes without touching df itself, so the
    # index and the column order of df are kept in the returned dataframe.
    row_keys = df[filter_columns].set_index(filter_columns).index

    filter_keys = df_with_filters.set_index(filter_columns).index

    # isin returns a positional boolean array with one entry per row of df,
    # so it lines up with df regardless of the labels in df.index.
    df = df.assign(ignore_from_link=row_keys.isin(filter_keys))

    # TODO: Consider what should be logged and reroute print to logs
    print("These values were flagged:\n", df.loc[df["ignore_from_link"]])

    return df
@@ -0,0 +1,29 @@
+identifier,date,group,question,other,ignore_from_link
+70001,202001,100,5951.0,39,False
+70001,202002,100,1814.0,39,False
+70001,202003,100,734.0,39,True
+70001,202004,100,96.0,39,False
+70001,202005,100,9086.0,39,True
+70001,202006,100,3949.0,39,False
+70001,202007,100,49.0,39,False
+70002,202001,100,6705.0,94,False
+70002,202002,100,48.0,94,False
+70002,202003,100,5361.0,94,False
+70002,202004,100,8767.0,94,False
+70002,202005,100,9214.0,94,False
+70002,202006,100,7467.0,94,False
+70002,202007,100,3475.0,94,False
+70003,202001,100,6153.0,42,False
+70003,202002,100,7711.0,42,False
+70003,202003,100,5403.0,42,False
+70003,202004,100,7445.0,42,False
+70003,202005,100,7092.0,42,False
+70003,202006,100,2038.0,42,False
+70003,202007,100,8768.0,42,False
+70004,202001,100,,6,False
+70004,202002,100,,6,False
+70004,202003,100,6288.0,6,False
+70004,202004,100,,6,False
+70004,202005,100,,6,False
+70004,202006,100,5875.0,6,False
+70004,202007,100,,6,False
@@ -0,0 +1,3 @@
+identifier,date
+70001,202003
+70001,202005
@@ -0,0 +1,39 @@
+import pandas as pd
+import pytest
+from pandas.testing import assert_frame_equal
+
+from src.link_filter import flag_rows_to_ignore
+
+
@pytest.mark.parametrize("scenario", ["test_flag_data"])
@pytest.mark.parametrize("filters", ["test_flag_filters"])
class TestFilters:
    """Tests for flag_rows_to_ignore.

    scenario and filters are the base names of CSV files which are read from
    the tests/ directory, relative to the current working directory.
    """

    def test_basic_filter(self, scenario, filters):
        """Test ignore_from_link is correct"""

        df_output_expected = pd.read_csv("tests/" + scenario + ".csv")

        df_filters = pd.read_csv("tests/" + filters + ".csv")

        # The input is the expected output without the flag column.
        df_input = df_output_expected.drop(columns=["ignore_from_link"])

        df_output = flag_rows_to_ignore(df_input, df_filters)

        assert_frame_equal(df_output, df_output_expected)

    def test_exception(self, scenario, filters):
        """Test if function raises an exception when the columns in filters
        do not exist in scenario."""

        df_output_expected = pd.read_csv("tests/" + scenario + ".csv")

        df_filters = pd.read_csv("tests/" + filters + ".csv")

        df_input = df_output_expected.drop(columns=["ignore_from_link"])

        # Rename the filter columns outside the raises context, so that an
        # error raised by the test setup cannot be mistaken for the error
        # expected from flag_rows_to_ignore.
        df_filters_renamed = df_filters.add_suffix("_fail")

        with pytest.raises(ValueError, match="double check the column names"):

            flag_rows_to_ignore(df_input, df_filters_renamed)