Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

321 flag to ignore response #18

Merged
merged 3 commits into from
Jun 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 49 additions & 0 deletions src/link_filter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
import pandas as pd

# TODO: Extend flag_rows_to_ignore to accept several filter dataframes via *df_with_filters


def flag_rows_to_ignore(
    df: pd.DataFrame, df_with_filters: pd.DataFrame
) -> pd.DataFrame:
    """
    Flag the rows of df which match an observation in df_with_filters.

    A bool column named ignore_from_link is added to a copy of df. It is
    True for every row whose values in the columns of df_with_filters are
    equal to one of the rows of df_with_filters, and False otherwise. The
    input dataframe is not modified; the returned dataframe keeps the index
    and the column order of df.

    Parameters
    ----------
    df : pd.DataFrame
        Original dataframe.
    df_with_filters : pd.DataFrame
        Dataframe with observations which should be flagged in the original
        dataframe. All of its columns must exist in df. If it has no
        columns at all, no row is flagged.

    Returns
    -------
    df : pd.DataFrame
        Copy of the original dataframe with a bool column containing the
        flags.

    Raises
    ------
    ValueError
        If df_with_filters has a column which does not exist in df.

    """

    filter_columns = list(df_with_filters)

    if not set(filter_columns).issubset(df.columns):

        raise ValueError(
            f"""df_with_filters has these columns {list(df_with_filters)} while
df has these columns {list(df)}, please
double check the column names."""
        )

    # TODO: Check if values to be ignored exist

    if filter_columns:
        # Build the key indexes on temporary frames only, so that neither
        # the index nor the column order of df is altered by the lookup.
        keys_in_df = df.set_index(filter_columns).index
        keys_to_flag = df_with_filters.set_index(filter_columns).index
        flags = keys_in_df.isin(keys_to_flag)
    else:
        # No key columns means there is nothing to match on: flag no rows
        # instead of failing inside set_index with an empty key list.
        flags = pd.Series(False, index=df.index, dtype=bool).to_numpy()

    # Work on a copy so the caller's dataframe is left untouched.
    df = df.copy()

    df["ignore_from_link"] = flags

    # TODO: Consider what should be logged and reroute print to logs
    print("These values were flagged:\n", df.loc[df["ignore_from_link"]])

    return df
29 changes: 29 additions & 0 deletions tests/test_flag_data.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
identifier,date,group,question,other,ignore_from_link
70001,202001,100,5951.0,39,False
70001,202002,100,1814.0,39,False
70001,202003,100,734.0,39,True
70001,202004,100,96.0,39,False
70001,202005,100,9086.0,39,True
70001,202006,100,3949.0,39,False
70001,202007,100,49.0,39,False
70002,202001,100,6705.0,94,False
70002,202002,100,48.0,94,False
70002,202003,100,5361.0,94,False
70002,202004,100,8767.0,94,False
70002,202005,100,9214.0,94,False
70002,202006,100,7467.0,94,False
70002,202007,100,3475.0,94,False
70003,202001,100,6153.0,42,False
70003,202002,100,7711.0,42,False
70003,202003,100,5403.0,42,False
70003,202004,100,7445.0,42,False
70003,202005,100,7092.0,42,False
70003,202006,100,2038.0,42,False
70003,202007,100,8768.0,42,False
70004,202001,100,,6,False
70004,202002,100,,6,False
70004,202003,100,6288.0,6,False
70004,202004,100,,6,False
70004,202005,100,,6,False
70004,202006,100,5875.0,6,False
70004,202007,100,,6,False
3 changes: 3 additions & 0 deletions tests/test_flag_filters.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
identifier,date
70001,202003
70001,202005
39 changes: 39 additions & 0 deletions tests/test_link_filter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import pandas as pd
import pytest
from pandas.testing import assert_frame_equal

from src.link_filter import flag_rows_to_ignore


@pytest.mark.parametrize("scenario", ["test_flag_data"])
@pytest.mark.parametrize("filters", ["test_flag_filters"])
class TestFilters:
    """Tests for flag_rows_to_ignore based on the CSV files under tests/.

    scenario is the name of the CSV holding the expected output (including
    the ignore_from_link column) and filters is the name of the CSV holding
    the observations to flag.
    """

    def test_basic_filter(self, scenario, filters):
        """Test ignore_from_link is correct"""

        df_output_expected = pd.read_csv("tests/" + scenario + ".csv")

        df_filters = pd.read_csv("tests/" + filters + ".csv")

        # The input is the expected output without the flag column.
        df_input = df_output_expected.drop(columns=["ignore_from_link"])

        df_output = flag_rows_to_ignore(df_input, df_filters)

        assert_frame_equal(df_output, df_output_expected)

    def test_exception(self, scenario, filters):
        """Test if function raises an exception when the columns in filters
        do not exist in scenario."""

        df_output_expected = pd.read_csv("tests/" + scenario + ".csv")

        df_filters = pd.read_csv("tests/" + filters + ".csv")

        df_input = df_output_expected.drop(columns=["ignore_from_link"])

        # Rename the filter columns outside of pytest.raises so that only
        # the call under test can satisfy the expected exception.
        df_filters.columns = df_filters.columns + "_fail"

        with pytest.raises(ValueError, match="double check the column names"):

            flag_rows_to_ignore(df_input, df_filters)
Loading