-
Notifications
You must be signed in to change notification settings - Fork 19
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Add compare_fields * Ignore * warnings * Refactor price * Support nested structures * Add normalization * Add more_stats to easily access all data, replace Result class eq with assert * Update rules to new assert * Fix tqdm warning
- Loading branch information
1 parent
0538719
commit 77075db
Showing
25 changed files
with
449 additions
and
278 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,98 @@ | ||
from typing import Tuple | ||
|
||
from arche.readers.schema import TaggedFields | ||
from arche.rules.result import * | ||
|
||
|
||
# Count of missing values at which `fields` stops listing all of them in its
# message and switches to a truncated preview ending with "...".
MAX_MISSING_VALUES = 6 | ||
|
||
|
||
def fields(
    source_df: pd.DataFrame,
    target_df: pd.DataFrame,
    names: List[str],
    normalize: bool = False,
    err_thr: float = 0.25,
) -> Result:
    """Find the differences in field values between two dataframes.

    For every field in `names` the non-NaN values of both dataframes are
    split into values present in both ("same"), values present only in the
    source ("new") and values present only in the target ("missing").

    Args:
        source_df: the source dataframe; values found only here are "new".
        target_df: the target dataframe; values found only here are "missing".
        names: a list of field names to compare.
        normalize: if set, all fields are converted to str and processed with
            lower() and strip() before the comparison.
        err_thr: failure threshold - once the number of missing values divided
            by the number of target rows reaches it, the missing values are
            reported as an error instead of an info message.

    Returns:
        Result with the same, new and missing values of every field stored in
        `more_stats`, plus one summary message per field.
    """

    def get_difference(
        left: pd.Series, right: pd.Series
    ) -> Tuple[pd.Series, pd.Series, pd.Series]:
        """Return (left values in right, left-only values, right-only values)."""
        in_right = left.isin(right)
        return left[in_right], left[~in_right], right[~right.isin(left)]

    result = Result("Fields Difference")
    for field in names:
        source = source_df[field].dropna()
        target = target_df[field].dropna()
        if normalize:
            source = source.astype(str).str.lower().str.strip()
            target = target.astype(str).str.lower().str.strip()
        try:
            same, new, missing = get_difference(source, target)
        except SystemError:
            # Fall back to comparing string representations when the raw
            # values cannot be compared directly.
            source = source.astype(str)
            target = target.astype(str)
            same, new, missing = get_difference(source, target)

        same.name, new.name, missing.name = (None, None, None)
        result.more_stats.update(
            {f"{field}": {"same": same, "new": new, "missing": missing}}
        )
        result.add_info(
            f"{len(source)} `non NaN {field}s` - {len(new)} new, {len(same)} same"
        )
        if missing.empty:
            continue

        # Decide on truncation from the values actually listed (the unique
        # ones), so that "..." only appears when something was left out.
        unique_missing = missing.unique()
        if len(unique_missing) < MAX_MISSING_VALUES:
            msg = ", ".join(unique_missing.astype(str))
        else:
            preview = unique_missing[: MAX_MISSING_VALUES - 1]
            msg = f"{', '.join(preview.astype(str))}..."
        msg = f"{msg} `{field}s` are missing"

        summary = f"{len(missing)} `{field}s` are missing"
        # `missing` is a non-empty subset of the target column here, so
        # `target_df` has at least one row and the division is safe.
        if len(missing) / len(target_df) >= err_thr:
            result.add_error(summary, errors={msg: set(missing.index)})
        else:
            result.add_info(summary, errors={msg: set(missing.index)})
    return result
|
||
|
||
def tagged_fields(
    source_df: pd.DataFrame,
    target_df: pd.DataFrame,
    tagged_fields: TaggedFields,
    tags: List[str],
) -> Result:
    """Compare the fields carrying any of `tags` between two dataframes.

    Field names are collected per tag, in the order of `tags`, and passed to
    `fields`. When no tag yields any field, the returned Result only holds
    the skipped outcome.
    """
    title = f"{', '.join(tags)} Fields Difference"
    # Tags without (or with empty) field lists contribute nothing.
    selected: List[str] = [
        field_name
        for tag in tags
        for field_name in (tagged_fields.get(tag) or ())
    ]
    if selected:
        comparison = fields(source_df, target_df, selected)
        comparison.name = title
        return comparison

    skipped = Result(title)
    skipped.add_info(Outcome.SKIPPED)
    return skipped
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.