-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Creating mapping validation function * Docstrings * Update docstring and leave unmatched as set * Update to raise warning and created wraper to test multiple mapping files in one go * adding test that passes when a warning raised * update docstring to ask that mapping file be a folder and not a file
- Loading branch information
Showing
3 changed files
with
123 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,100 @@ | ||
import warnings | ||
|
||
import pandas as pd | ||
|
||
|
||
def wrap_mapping_validations(df: pd.DataFrame, mapping_folder: str): | ||
""" | ||
wrapper to loop over the specified mapping files in file_and_column_names dict. | ||
Calls mapping_validation for the specified files | ||
Probably could move the files and column names to the config if needed. | ||
Parameters | ||
---------- | ||
df : pd.DataFrame | ||
input data to test against mapping files | ||
mapping_folder : str | ||
folder where the mapping files can be located. | ||
String must end with a '/' as we need this to be a folder where other mapping | ||
files are located | ||
""" | ||
# Might be best to refactor this down the line and put into config if used | ||
files_and_column_names = { | ||
"sic_sut_mapping.csv": { | ||
"df_column_name": "sic_5_digit", | ||
"mapping_file_col_name": "sic", | ||
}, | ||
"classification_sic_mapping.csv": { | ||
"df_column_name": "sic_5_digit", | ||
"mapping_file_col_name": "sic_5_digit", | ||
}, | ||
"sic_domain_mapping.csv": { | ||
"df_column_name": "sic_5_digit", | ||
"mapping_file_col_name": "sic_5_digit", | ||
}, | ||
} | ||
|
||
for i in files_and_column_names: | ||
mapping_path = mapping_folder + i | ||
column_dict = files_and_column_names[i] | ||
mapping_validation( | ||
df, | ||
mapping_path, | ||
column_dict["df_column_name"], | ||
column_dict["mapping_file_col_name"], | ||
) | ||
|
||
|
||
def mapping_validation( | ||
df: pd.DataFrame, | ||
mapping_path: str, | ||
df_column_name: str, | ||
mapping_file_column_name: str, | ||
): | ||
""" | ||
validation function to check mapping file against a column within a given dataframe | ||
Only works with 'true' mapping files which two columns. Mapping files containing | ||
three columns will produce an error. | ||
Parameters | ||
---------- | ||
df : pd.DataFrame | ||
input dataframe containing the data used in pipeline | ||
mapping_path : str | ||
path to mapping file to validate | ||
df_column_name : str | ||
column name of column to join on in df | ||
mapping_file_column_name : str | ||
column name of column to join on from mapping file | ||
Raises | ||
------ | ||
Warning | ||
Warns if data within the original dataframe has not been mapped using | ||
the mapping file supplied | ||
""" | ||
|
||
mapping_df = pd.read_csv(mapping_path) | ||
df_subset = df[df_column_name] | ||
new_column_name = [ | ||
x for x in mapping_df.columns if x not in mapping_file_column_name | ||
] | ||
new_column_name = "".join(new_column_name) | ||
df_subset = pd.merge( | ||
left=df_subset, | ||
right=mapping_df, | ||
left_on=df_column_name, | ||
right_on=mapping_file_column_name, | ||
how="left", | ||
) | ||
unmatched = df_subset.loc[ | ||
df_subset[new_column_name].isna(), df_column_name | ||
].to_list() | ||
|
||
if unmatched: | ||
unmatched = set(unmatched) | ||
mapping_file_name = mapping_path.split("/")[-1] | ||
warnings.warn( | ||
f"\n \n The following values from {df_column_name} in input dataframe " | ||
+ f"are not mapped using {mapping_file_name}: \n {unmatched}" | ||
) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
sic,domain | ||
1,10 | ||
2,20 | ||
3,30 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
import pandas as pd | ||
import pytest | ||
|
||
from mbs_results.utilities.mapping_validation import mapping_validation | ||
|
||
scenario_path_prefix = "tests/data/" | ||
|
||
|
||
def test_mapping_validation(): | ||
df = pd.DataFrame({"name": ["a", "b", "c", "d"], "sic": [1, 2, 3, 4]}) | ||
|
||
mapping_path = scenario_path_prefix + "utilities_data/mapping_missing.csv" | ||
with pytest.warns(UserWarning): | ||
mapping_validation( | ||
df=df, | ||
mapping_path=mapping_path, | ||
df_column_name="sic", | ||
mapping_file_column_name="sic", | ||
) |