diff --git a/mbs_results/staging/data_cleaning.py b/mbs_results/staging/data_cleaning.py index 119f6fe9..d08e5ed2 100644 --- a/mbs_results/staging/data_cleaning.py +++ b/mbs_results/staging/data_cleaning.py @@ -1,6 +1,5 @@ from typing import List -import numpy as np import pandas as pd from mbs_results.utilities.utils import convert_column_to_datetime @@ -310,6 +309,8 @@ def run_live_or_frozen( """ + df = df.copy() + if state not in ["frozen", "live"]: raise ValueError( """{} is not an accepted state status, use either frozen or live """.format( @@ -318,8 +319,10 @@ def run_live_or_frozen( ) if state == "frozen": - - df.loc[df[error_marker].isin(error_values), target] = np.nan + df["frozen_error"] = df.apply( + lambda x: x[target] if x[error_marker] in (error_values) else "", axis=1 + ) + df = df.fillna("") return df diff --git a/tests/data/staging/data_cleaning/test_run_live_or_frozen.csv b/tests/data/staging/data_cleaning/test_run_live_or_frozen.csv index c68ed0e5..02471894 100644 --- a/tests/data/staging/data_cleaning/test_run_live_or_frozen.csv +++ b/tests/data/staging/data_cleaning/test_run_live_or_frozen.csv @@ -1,8 +1,8 @@ -target,error,live,frozen -1,C,1,1 -2,E,2, -3,O,3,3 -4,W,4, -5,C,5,5 -6,E,6, -7,W,7, +target,error,live,frozen,frozen_error +2,C,2,2, +7,E,7,,7 +1,O,1,1, +6,W,6,,6 +3,C,3,3, +5,E,5,,5 +4,W,4,,4 diff --git a/tests/staging/test_data_cleaning.py b/tests/staging/test_data_cleaning.py index cfa77c2c..fff60736 100644 --- a/tests/staging/test_data_cleaning.py +++ b/tests/staging/test_data_cleaning.py @@ -104,13 +104,16 @@ def test_run_live_or_frozen(filepath): df = pd.read_csv(filepath / "test_run_live_or_frozen.csv") - df_in = df.drop(columns=["frozen"]) + df_in = df.drop(columns=["frozen", "frozen_error"]) live_ouput = run_live_or_frozen(df_in, "target", "error", "live") + frozen_output = run_live_or_frozen(df_in, "target", "error", "frozen") - expected_output_frozen = df_in.copy() - expected_output_frozen["target"] = df["frozen"] + expected_output_frozen = df.copy() + + expected_output_frozen.drop(columns=["frozen"], inplace=True) + expected_output_frozen = expected_output_frozen.fillna("") assert_frame_equal(frozen_output, expected_output_frozen) assert_frame_equal(live_ouput, df_in)