"""Check that values in a dataframe column are covered by mapping files."""

import os
import warnings

import pandas as pd


def wrap_mapping_validations(df: pd.DataFrame, mapping_folder: str) -> None:
    """
    Run mapping_validation for each mapping file listed in this function.

    The file names and the columns to compare are hard-coded in the
    files_and_column_names dict below; they could be moved to the config if
    this is used more widely.

    Parameters
    ----------
    df : pd.DataFrame
        input data to test against mapping files
    mapping_folder : str
        folder where the mapping files can be located. A trailing '/' is
        accepted but no longer required, the file name is joined on with
        os.path.join.
    """
    # Might be best to refactor this down the line and put into config if used
    files_and_column_names = {
        "sic_sut_mapping.csv": {
            "df_column_name": "sic_5_digit",
            "mapping_file_col_name": "sic",
        },
        "classification_sic_mapping.csv": {
            "df_column_name": "sic_5_digit",
            "mapping_file_col_name": "sic_5_digit",
        },
        "sic_domain_mapping.csv": {
            "df_column_name": "sic_5_digit",
            "mapping_file_col_name": "sic_5_digit",
        },
    }

    for file_name, column_dict in files_and_column_names.items():
        mapping_validation(
            df,
            os.path.join(mapping_folder, file_name),
            column_dict["df_column_name"],
            column_dict["mapping_file_col_name"],
        )


def mapping_validation(
    df: pd.DataFrame,
    mapping_path: str,
    df_column_name: str,
    mapping_file_column_name: str,
) -> None:
    """
    Warn about values in a dataframe column that a mapping file does not map.

    A value counts as unmapped when it does not appear in the key column of
    the mapping file, or when it appears on a row where any of the other
    (mapped) columns is missing. The mapping file may hold any number of
    mapped columns.

    Parameters
    ----------
    df : pd.DataFrame
        input dataframe containing the data used in pipeline
    mapping_path : str
        path to mapping file to validate
    df_column_name : str
        column name of column to check in df
    mapping_file_column_name : str
        column name of the key column in the mapping file

    Warns
    -----
    UserWarning
        If data within the original dataframe has not been mapped using
        the mapping file supplied

    Raises
    ------
    KeyError
        If df_column_name is not in df or mapping_file_column_name is not in
        the mapping file.
    """
    mapping_df = pd.read_csv(mapping_path)
    keys = df[df_column_name]
    mapping_keys = mapping_df[mapping_file_column_name]

    # Values that never appear as a key in the mapping file.
    unmatched_mask = ~keys.isin(mapping_keys)

    # Compare column names with equality: a substring test would discard
    # mapped columns whose name is contained in the key column name.
    value_columns = [
        column
        for column in mapping_df.columns
        if column != mapping_file_column_name
    ]
    if value_columns:
        # A key that is present but maps to a missing value is unmapped too.
        incomplete_rows = mapping_df[value_columns].isna().any(axis=1)
        unmatched_mask |= keys.isin(mapping_keys[incomplete_rows])

    # drop_duplicates collapses repeated NaN, which a plain set() would not.
    unmatched = set(keys[unmatched_mask].drop_duplicates().to_list())

    if unmatched:
        mapping_file_name = os.path.basename(mapping_path)
        warnings.warn(
            f"\n \n The following values from {df_column_name} in input dataframe "
            + f"are not mapped using {mapping_file_name}: \n {unmatched}",
            stacklevel=2,
        )