From e336d336b9d971ab2f4eaff8452cfbe23cfa4bb5 Mon Sep 17 00:00:00 2001 From: manycoding Date: Thu, 12 Sep 2019 12:21:51 -0300 Subject: [PATCH 01/10] Add compare_fields --- docs/source/nbs/Rules.ipynb | 26 +++++++++++++++++++ src/arche/rules/compare.py | 47 ++++++++++++++++++++++++++++++++++ tests/rules/test_compare.py | 51 +++++++++++++++++++++++++++++++++++++ 3 files changed, 124 insertions(+) create mode 100644 src/arche/rules/compare.py create mode 100644 tests/rules/test_compare.py diff --git a/docs/source/nbs/Rules.ipynb b/docs/source/nbs/Rules.ipynb index 5089f1a..35520ac 100644 --- a/docs/source/nbs/Rules.ipynb +++ b/docs/source/nbs/Rules.ipynb @@ -200,6 +200,32 @@ "arche.rules.category.get_difference(df, target_df, [\"category\"]).show()" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Compare\n", + "### Fields" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "help(arche.rules.compare.fields)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "arche.rules.compare.fields(df, target_df, [\"part_number\", \"name\", \"uom\"]).show()" + ] + }, { "cell_type": "markdown", "metadata": {}, diff --git a/src/arche/rules/compare.py b/src/arche/rules/compare.py new file mode 100644 index 0000000..eeee202 --- /dev/null +++ b/src/arche/rules/compare.py @@ -0,0 +1,47 @@ +from typing import List + +from arche.rules.result import Result +import pandas as pd + + +def fields( + source_df: pd.DataFrame, + target_df: pd.DataFrame, + fields: List[str], + err_thr: float = 0.25, +) -> Result: + """Return field values difference between jobs""" + + result = Result("Fields Difference") + + for field in fields: + source = source_df[field].dropna() + target = target_df[field].dropna() + same = source[source.isin(target)] + new = source[~(source.isin(target))] + result.add_info( + f"{len(source)} `non NaN {field}s` - {len(new)} new, 
{len(same)} same" + ) + missing = target[~(target.isin(source))] + missing_values = missing.values + if len(missing_values) == 0: + continue + + if len(missing) < 6: + msg = ", ".join(missing_values.astype(str)) + else: + missing_values = missing[:5].values + msg = f"{', '.join(missing_values.astype(str))}..." + msg = f"{msg} `{field}s` are missing" + if len(missing) / len(target_df) >= err_thr: + result.add_error( + f"{len(missing)} `{field}s` are missing", + errors={msg: set(missing.index)}, + ) + else: + result.add_info( + f"{len(missing)} `{field}s` are missing", + errors={msg: set(missing.index)}, + ) + + return result diff --git a/tests/rules/test_compare.py b/tests/rules/test_compare.py new file mode 100644 index 0000000..685dc08 --- /dev/null +++ b/tests/rules/test_compare.py @@ -0,0 +1,51 @@ +import arche.rules.compare as compare +from arche.rules.result import Level +from conftest import * +import pytest + + +@pytest.mark.parametrize( + ["source", "target", "fields", "expected"], + [ + ( + { + "one": list(range(50)) + [42] * 50, + "two": list(range(100)), + "three": [np.nan] * 50 + list(range(50)), + }, + { + "one": list(range(50, 100)) + [42] * 500, + "two": list(range(550)), + "three": [np.nan] * 500 + list(range(50)), + }, + ["one", "two", "three"], + { + Level.INFO: [ + ("100 `non NaN ones` - 49 new, 51 same",), + ( + "50 `ones` are missing", + None, + {"50, 51, 52, 53, 54... `ones` are missing": set(range(50))}, + ), + ("100 `non NaN twos` - 0 new, 100 same",), + ("50 `non NaN threes` - 0 new, 50 same",), + ], + Level.ERROR: [ + ( + "450 `twos` are missing", + None, + { + "100, 101, 102, 103, 104... 
`twos` are missing": set( + range(100, 550) + ) + }, + ) + ], + }, + ) + ], +) +def test_fields(source, target, fields, expected): + assert compare.fields( + pd.DataFrame(source), pd.DataFrame(target), fields + ) == create_result("Fields Difference", expected) From e89ac5a71fd2a86f1dc06600ad48fe2d4f7d13a9 Mon Sep 17 00:00:00 2001 From: manycoding Date: Thu, 12 Sep 2019 12:21:58 -0300 Subject: [PATCH 02/10] Ignore * warnings --- tox.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tox.ini b/tox.ini index 077c79c..46f6fc8 100755 --- a/tox.ini +++ b/tox.ini @@ -24,7 +24,7 @@ commands = [flake8] select = C,E,F,W,I,D,B,B9 -ignore = W503, E741, E501, E203, I101 +ignore = W503, E741, E501, E203, I101, F403, F405 exclude = .tox, .git, From 46a6ab7704a6bfe352446fd04904213ea855bd13 Mon Sep 17 00:00:00 2001 From: manycoding Date: Thu, 12 Sep 2019 14:59:58 -0300 Subject: [PATCH 03/10] Refactor price --- src/arche/arche.py | 9 ++++++ src/arche/readers/schema.py | 2 +- src/arche/rules/compare.py | 38 +++++++++++++++++++------ src/arche/rules/price.py | 55 ++++--------------------------------- tests/rules/test_price.py | 14 ++-------- 5 files changed, 47 insertions(+), 71 deletions(-) diff --git a/src/arche/arche.py b/src/arche/arche.py index 3e67ee5..fa0c2f0 100755 --- a/src/arche/arche.py +++ b/src/arche/arche.py @@ -7,6 +7,7 @@ from arche.readers.schema import Schema, SchemaSource from arche.report import Report import arche.rules.category as category_rules +import arche.rules.compare as compare import arche.rules.coverage as coverage_rules import arche.rules.duplicates as duplicate_rules import arche.rules.json_schema as schema_rules @@ -256,3 +257,11 @@ def compare_with_customized_rules(self, source_items, target_items, tagged_field price_rules.compare_prices_for_same_names, ]: self.save_result(r(source_items.df, target_items.df, tagged_fields)) + self.save_result( + compare.tagged_fields( + source_items.df, + target_items.df, + tagged_fields, + 
["product_url_field", "name_field"], + ) + ) diff --git a/src/arche/readers/schema.py b/src/arche/readers/schema.py index 06b8883..5a1d369 100755 --- a/src/arche/readers/schema.py +++ b/src/arche/readers/schema.py @@ -69,7 +69,7 @@ def get_tags(schema: RawSchema) -> TaggedFields: property_tags = value.get("tag", []) if property_tags: tagged_fields = Schema.get_field_tags(property_tags, key, tagged_fields) - return tagged_fields + return dict(tagged_fields) @classmethod def get_field_tags( diff --git a/src/arche/rules/compare.py b/src/arche/rules/compare.py index eeee202..ab2fa81 100644 --- a/src/arche/rules/compare.py +++ b/src/arche/rules/compare.py @@ -1,20 +1,21 @@ from typing import List -from arche.rules.result import Result +from arche.readers.schema import TaggedFields +from arche.rules.result import * import pandas as pd def fields( source_df: pd.DataFrame, target_df: pd.DataFrame, - fields: List[str], + names: List[str], err_thr: float = 0.25, ) -> Result: - """Return field values difference between jobs""" + """Return fields values difference between dataframes""" result = Result("Fields Difference") - for field in fields: + for field in names: source = source_df[field].dropna() target = target_df[field].dropna() same = source[source.isin(target)] @@ -23,15 +24,13 @@ def fields( f"{len(source)} `non NaN {field}s` - {len(new)} new, {len(same)} same" ) missing = target[~(target.isin(source))] - missing_values = missing.values - if len(missing_values) == 0: + if len(missing) == 0: continue if len(missing) < 6: - msg = ", ".join(missing_values.astype(str)) + msg = ", ".join(missing.unique().astype(str)) else: - missing_values = missing[:5].values - msg = f"{', '.join(missing_values.astype(str))}..." + msg = f"{', '.join(missing.unique()[:5].astype(str))}..." 
msg = f"{msg} `{field}s` are missing" if len(missing) / len(target_df) >= err_thr: result.add_error( @@ -45,3 +44,24 @@ def fields( ) return result + + +def tagged_fields( + source_df: pd.DataFrame, + target_df: pd.DataFrame, + tagged_fields: TaggedFields, + tags: List[str], +) -> Result: + """Compare fields tagged with `tags` between two dataframes.""" + name = f"{', '.join(tags)} Fields Difference" + result = Result(name) + fields_names = list() + for tag in tags: + if tagged_fields.get(tag): + fields_names.extend(tagged_fields.get(tag)) + if not fields_names: + result.add_info(Outcome.SKIPPED) + return result + result = fields(source_df, target_df, fields_names) + result.name = name + return result diff --git a/src/arche/rules/price.py b/src/arche/rules/price.py index 868ee7a..7216e42 100755 --- a/src/arche/rules/price.py +++ b/src/arche/rules/price.py @@ -65,14 +65,13 @@ def compare_was_now(df: pd.DataFrame, tagged_fields: TaggedFields): def compare_prices_for_same_urls( source_df: pd.DataFrame, target_df: pd.DataFrame, tagged_fields: TaggedFields -): +) -> Result: """For each pair of items that have the same `product_url_field` tagged field, compare `product_price_field` field Returns: - A result containing pairs of items with same `product_url_field` - from `source_df` and `target_df` which `product_price_field` differ, - missing and new `product_url_field` tagged fields. + A result containing pairs of items from `source_df` and `target_df` + which `product_price_field` differ. 
""" result = Result("Compare Prices For Same Urls") url_field = tagged_fields.get("product_url_field") @@ -88,26 +87,7 @@ def compare_prices_for_same_urls( same_urls = source_df[(source_df[url_field].isin(target_df[url_field].values))][ url_field ] - new_urls = source_df[~(source_df[url_field].isin(target_df[url_field].values))][ - url_field - ] - missing_urls = target_df[(~target_df[url_field].isin(source_df[url_field].values))][ - url_field - ] - - errors = {} - for url, group in missing_urls.groupby(missing_urls): - errors[f"Missing {url}"] = set(group.index) - - if not missing_urls.empty: - result.add_info( - f"{len(missing_urls)} urls missing from the tested job", errors=errors - ) - if not new_urls.empty: - result.add_info(f"{len(new_urls)} new urls in the tested job") - result.add_info(f"{len(same_urls)} same urls in both jobs") - diff_prices_count = 0 price_field = tagged_fields.get("product_price_field") if not price_field: result.add_info("product_price_field tag is not set") @@ -128,7 +108,6 @@ def compare_prices_for_same_urls( and is_number(target_price) and ratio_diff(source_price, target_price) > 0.1 ): - diff_prices_count += 1 source_key = source_df[source_df[url_field] == url].index[0] target_key = target_df[target_df[url_field] == url].index[0] msg = ( @@ -137,7 +116,7 @@ def compare_prices_for_same_urls( ) detailed_messages.append(msg) - res = f"{len(same_urls)} checked, {diff_prices_count} errors" + res = f"{len(same_urls)} checked, {len(detailed_messages)} errors" if detailed_messages: result.add_error(res, detailed="\n".join(detailed_messages)) else: @@ -212,33 +191,12 @@ def compare_prices_for_same_names( same_names = source_df[(source_df[name_field].isin(target_df[name_field].values))][ name_field ] - new_names = source_df[~(source_df[name_field].isin(target_df[name_field].values))][ - name_field - ] - missing_names = target_df[ - ~(target_df[name_field].isin(source_df[name_field].values)) - ][name_field] - errors = {} - for name, group in 
missing_names.groupby(missing_names): - errors[f"Missing {name}"] = set(group.index) - - if not missing_names.empty: - result.add_info( - f"{len(missing_names)} names missing from the tested job", errors=errors - ) - if not new_names.empty: - result.add_info(f"{len(new_names)} new names in the tested job") - result.add_info(f"{len(same_names)} same names in both jobs") - - price_tag = "product_price_field" - price_field = tagged_fields.get(price_tag) + price_field = tagged_fields.get("product_price_field") if not price_field: result.add_info("product_price_field tag is not set") return result - price_field = price_field[0] - count = 0 detailed_messages = [] for name in same_names: @@ -247,7 +205,6 @@ def compare_prices_for_same_names( target_price = target_df[target_df[name_field] == name][price_field].iloc[0] if is_number(source_price) and is_number(target_price): if ratio_diff(source_price, target_price) > 0.1: - count += 1 source_key = source_df[source_df[name_field] == name].index[0] target_key = target_df[target_df[name_field] == name].index[0] msg = ( @@ -256,7 +213,7 @@ def compare_prices_for_same_names( ) detailed_messages.append(msg) - result_msg = f"{len(same_names)} checked, {count} errors" + result_msg = f"{len(same_names)} checked, {len(detailed_messages)} errors" if detailed_messages: result.add_error(result_msg, detailed="\n".join(detailed_messages)) else: diff --git a/tests/rules/test_price.py b/tests/rules/test_price.py index b6dca99..237a13c 100755 --- a/tests/rules/test_price.py +++ b/tests/rules/test_price.py @@ -63,7 +63,6 @@ def test_compare_was_now(data, tagged_fields, expected_messages): {"price": [1.15, "2.3", 6], "url": ["http://1", "http://2", np.nan]}, {"product_price_field": ["price"], "product_url_field": ["url"]}, { - Level.INFO: [("2 same urls in both jobs",)], Level.ERROR: [ ( "2 checked, 2 errors", @@ -74,7 +73,7 @@ def test_compare_was_now(data, tagged_fields, expected_messages): "target price is 1.15 for 0" ), ) - ], + ] }, ) ] 
@@ -134,15 +133,6 @@ def test_compare_names_for_same_urls( {"name": ["Coffee", "Tea", "Wine"], "price": [4.0, 4.8, 20.0]}, {"name_field": ["name"], "product_price_field": ["price"]}, { - Level.INFO: [ - ( - "1 names missing from the tested job", - None, - {"Missing Wine": {2}}, - ), - ("1 new names in the tested job",), - ("2 same names in both jobs",), - ], Level.ERROR: [ ( "2 checked, 1 errors", @@ -151,7 +141,7 @@ def test_compare_names_for_same_urls( "target price is 4.0 for 0" ), ) - ], + ] }, ) ], From 6f2b100e127e2d1b7296ffce74e91280add62efe Mon Sep 17 00:00:00 2001 From: manycoding Date: Fri, 13 Sep 2019 17:25:16 -0300 Subject: [PATCH 04/10] Support nested structures --- src/arche/rules/compare.py | 15 +++++++++++---- tests/rules/test_compare.py | 33 ++++++++++++++++++++++++++++++--- 2 files changed, 41 insertions(+), 7 deletions(-) diff --git a/src/arche/rules/compare.py b/src/arche/rules/compare.py index ab2fa81..9af27d6 100644 --- a/src/arche/rules/compare.py +++ b/src/arche/rules/compare.py @@ -14,16 +14,23 @@ def fields( """Return fields values difference between dataframes""" result = Result("Fields Difference") - for field in names: source = source_df[field].dropna() target = target_df[field].dropna() - same = source[source.isin(target)] - new = source[~(source.isin(target))] + try: + same = source[source.isin(target)] + new = source[~(source.isin(target))] + missing = target[~(target.isin(source))] + except SystemError: + source = source.apply(str) + target = target.apply(str) + same = source[source.isin(target)] + new = source[~(source.isin(target))] + missing = target[~(target.isin(source))] + result.add_info( f"{len(source)} `non NaN {field}s` - {len(new)} new, {len(same)} same" ) - missing = target[~(target.isin(source))] if len(missing) == 0: continue diff --git a/tests/rules/test_compare.py b/tests/rules/test_compare.py index 685dc08..a1c8c7e 100644 --- a/tests/rules/test_compare.py +++ b/tests/rules/test_compare.py @@ -9,7 +9,7 @@ [ ( { - 
"one": list(range(50)) + [42] * 50, + "one": list(range(50)) + ["42"] * 50, "two": list(range(100)), "three": [np.nan] * 50 + list(range(50)), }, @@ -21,7 +21,7 @@ ["one", "two", "three"], { Level.INFO: [ - ("100 `non NaN ones` - 49 new, 51 same",), + ("100 `non NaN ones` - 99 new, 1 same",), ( "50 `ones` are missing", None, @@ -42,7 +42,34 @@ ) ], }, - ) + ), + ( + { + "four": [{i} for i in range(10)] + + [{"k": {"k": i}} for i in range(10)] + + ["l"] * 80 + }, + { + "four": [{i} for i in range(20)] + + [{"k": {"k": i}} for i in range(10)] + + ["l"] * 520 + }, + ["four"], + { + Level.INFO: [ + ("100 `non NaN fours` - 0 new, 100 same",), + ( + "10 `fours` are missing", + None, + { + "{10}, {11}, {12}, {13}, {14}... `fours` are missing": set( + range(10, 20) + ) + }, + ), + ] + }, + ), ], ) def test_fields(source, target, fields, expected): From 4df498ac33c09e0c945fe291fa7c1ef1c9272bb4 Mon Sep 17 00:00:00 2001 From: manycoding Date: Wed, 18 Sep 2019 11:46:45 -0300 Subject: [PATCH 05/10] Wee update --- src/arche/rules/compare.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/arche/rules/compare.py b/src/arche/rules/compare.py index 9af27d6..4e5e608 100644 --- a/src/arche/rules/compare.py +++ b/src/arche/rules/compare.py @@ -5,6 +5,9 @@ import pandas as pd +MAX_MISSING_VALUES = 6 + + def fields( source_df: pd.DataFrame, target_df: pd.DataFrame, @@ -22,8 +25,8 @@ def fields( new = source[~(source.isin(target))] missing = target[~(target.isin(source))] except SystemError: - source = source.apply(str) - target = target.apply(str) + source = source.astype(str) + target = target.astype(str) same = source[source.isin(target)] new = source[~(source.isin(target))] missing = target[~(target.isin(source))] @@ -34,7 +37,7 @@ def fields( if len(missing) == 0: continue - if len(missing) < 6: + if len(missing) < MAX_MISSING_VALUES: msg = ", ".join(missing.unique().astype(str)) else: msg = f"{', '.join(missing.unique()[:5].astype(str))}..." 
From 10897ceebbf50b10f35474bd98bad0ea029f2d68 Mon Sep 17 00:00:00 2001 From: manycoding Date: Thu, 19 Sep 2019 15:03:02 -0300 Subject: [PATCH 06/10] Add normalization --- src/arche/__init__.py | 1 + src/arche/rules/compare.py | 34 ++++++++++++++++++++++++---------- tests/rules/test_compare.py | 12 +++++++----- 3 files changed, 32 insertions(+), 15 deletions(-) diff --git a/src/arche/__init__.py b/src/arche/__init__.py index cf3d18e..109f08f 100755 --- a/src/arche/__init__.py +++ b/src/arche/__init__.py @@ -1,4 +1,5 @@ import logging +from typing import * # noqa __version__ = "0.3.6" SH_URL = "https://app.scrapinghub.com/p" # noqa diff --git a/src/arche/rules/compare.py b/src/arche/rules/compare.py index 4e5e608..dd7111e 100644 --- a/src/arche/rules/compare.py +++ b/src/arche/rules/compare.py @@ -1,8 +1,5 @@ -from typing import List - from arche.readers.schema import TaggedFields from arche.rules.result import * -import pandas as pd MAX_MISSING_VALUES = 6 @@ -12,24 +9,41 @@ def fields( source_df: pd.DataFrame, target_df: pd.DataFrame, names: List[str], + normalize: bool = False, err_thr: float = 0.25, ) -> Result: - """Return fields values difference between dataframes""" + """Return fields values difference between dataframe. + + Args: + names - a list of field names + normalize - if set, all fields converted to str and processed with lower() and strip() + + Returns: + Result with same, missing and new values. 
+ """ + + def get_difference( + left: pd.Series, right: pd.Series + ) -> (pd.Series, pd.Series, pd.Series): + return ( + left[left.isin(right)], + left[~(left.isin(right))], + right[~(right.isin(left))], + ) result = Result("Fields Difference") for field in names: source = source_df[field].dropna() target = target_df[field].dropna() + if normalize: + source = source.astype(str).str.lower().str.strip() + target = target.astype(str).str.lower().str.strip() try: - same = source[source.isin(target)] - new = source[~(source.isin(target))] - missing = target[~(target.isin(source))] + same, new, missing = get_difference(source, target) except SystemError: source = source.astype(str) target = target.astype(str) - same = source[source.isin(target)] - new = source[~(source.isin(target))] - missing = target[~(target.isin(source))] + same, new, missing = get_difference(source, target) result.add_info( f"{len(source)} `non NaN {field}s` - {len(new)} new, {len(same)} same" diff --git a/tests/rules/test_compare.py b/tests/rules/test_compare.py index a1c8c7e..42802c2 100644 --- a/tests/rules/test_compare.py +++ b/tests/rules/test_compare.py @@ -5,7 +5,7 @@ @pytest.mark.parametrize( - ["source", "target", "fields", "expected"], + ["source", "target", "fields", "normalize", "expected"], [ ( { @@ -19,6 +19,7 @@ "three": [np.nan] * 500 + list(range(50)), }, ["one", "two", "three"], + False, { Level.INFO: [ ("100 `non NaN ones` - 99 new, 1 same",), @@ -46,15 +47,16 @@ ( { "four": [{i} for i in range(10)] - + [{"k": {"k": i}} for i in range(10)] + + [{"K": {"k": i}} for i in range(10)] + ["l"] * 80 }, { "four": [{i} for i in range(20)] + [{"k": {"k": i}} for i in range(10)] - + ["l"] * 520 + + ["L"] * 520 }, ["four"], + True, { Level.INFO: [ ("100 `non NaN fours` - 0 new, 100 same",), @@ -72,7 +74,7 @@ ), ], ) -def test_fields(source, target, fields, expected): +def test_fields(source, target, fields, normalize, expected): assert compare.fields( - pd.DataFrame(source), 
pd.DataFrame(target), fields + pd.DataFrame(source), pd.DataFrame(target), fields, normalize ) == create_result("Fields Difference", expected) From da3b312df898f3da964197374eaaf5c3c2e8d264 Mon Sep 17 00:00:00 2001 From: manycoding Date: Thu, 26 Sep 2019 15:47:26 -0300 Subject: [PATCH 07/10] Add more_stats to easily access all data, replace Result class eq with assert --- src/arche/rules/compare.py | 5 +- src/arche/rules/result.py | 33 ++---------- tests/conftest.py | 63 ++++++++++++++-------- tests/rules/test_compare.py | 102 +++++++++++++++++++++++------------- tests/rules/test_result.py | 84 ----------------------------- tests/test_conftest.py | 30 +++++++++++ 6 files changed, 147 insertions(+), 170 deletions(-) create mode 100644 tests/test_conftest.py diff --git a/src/arche/rules/compare.py b/src/arche/rules/compare.py index dd7111e..eb4a237 100644 --- a/src/arche/rules/compare.py +++ b/src/arche/rules/compare.py @@ -45,6 +45,10 @@ def get_difference( target = target.astype(str) same, new, missing = get_difference(source, target) + same.name, new.name, missing.name = (None, None, None) + result.more_stats.update( + {f"{field}": {"same": same, "new": new, "missing": missing}} + ) result.add_info( f"{len(source)} `non NaN {field}s` - {len(new)} new, {len(same)} same" ) @@ -66,7 +70,6 @@ def get_difference( f"{len(missing)} `{field}s` are missing", errors={msg: set(missing.index)}, ) - return result diff --git a/src/arche/rules/result.py b/src/arche/rules/result.py index 172ee46..6eae28f 100755 --- a/src/arche/rules/result.py +++ b/src/arche/rules/result.py @@ -65,35 +65,12 @@ class Result: name: str messages: Dict[Level, List[Message]] = field(default_factory=dict) - _stats: Optional[List[Stat]] = field(default_factory=list) - items_count: Optional[int] = 0 + _stats: List[Stat] = field(default_factory=list) + more_stats: Dict[str, Dict] = field(default_factory=dict) + items_count: int = 0 _err_keys: Set[Union[str, int]] = field(default_factory=set) - 
_err_items_count: Optional[int] = 0 - _figures: Optional[List[go.FigureWidget]] = field(default_factory=list) - - def __eq__(self, other): - for left, right in zip(self.stats, other.stats): - if not self.tensors_equal(left, right): - return False - - return ( - self.name == other.name - and self.messages == other.messages - and self.items_count == other.items_count - and self.err_items_count == other.err_items_count - and len(self.stats) == len(other.stats) - ) - - @staticmethod - def tensors_equal(left: Stat, right: Stat): - try: - if isinstance(left, pd.DataFrame): - pd.testing.assert_frame_equal(left, right) - else: - pd.testing.assert_series_equal(left, right) - return True - except AssertionError: - return False + _err_items_count: int = 0 + _figures: List[go.FigureWidget] = field(default_factory=list) @property def info(self): diff --git a/tests/conftest.py b/tests/conftest.py index 1c53809..747f032 100755 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,6 +1,6 @@ from copy import deepcopy -from itertools import zip_longest -from typing import Dict, Iterable, List, Optional +from functools import partial +from typing import Any, Dict, Iterable, List, Optional from arche.readers.items import CollectionItems, JobItems from arche.rules.result import Level, Message, Result, Stat @@ -209,6 +209,7 @@ def create_result( messages: Dict[Level, List[Message]], stats: Optional[List[Stat]] = None, items_count: Optional[int] = None, + more_stats: Optional[Dict[str, Any]] = None, ) -> Result: result = Result(rule_name) for level, messages in messages.items(): @@ -217,30 +218,50 @@ def create_result( if stats: result.stats = stats + if more_stats: + result.more_stats = more_stats if items_count: result.items_count = items_count return result -def pytest_assertrepr_compare(op, left, right): - if isinstance(left, Result) and isinstance(right, Result) and op == "==": - assert_msgs = ["Results are equal"] - for (left_n, left_v), (_, right_v) in zip_longest( - 
left.__dict__.items(), right.__dict__.items() - ): - if left_n == "_stats": - for left_stat, right_stat in zip_longest(left_v, right_v): - try: - if isinstance(left_stat, pd.DataFrame): - pd.testing.assert_frame_equal(left_stat, right_stat) - else: - pd.testing.assert_series_equal(left_stat, right_stat) - except AssertionError as e: - assert_msgs.extend([f"{left_stat}", "!=", f"{right_stat}"]) - assert_msgs.extend(str(e).split("\n")) - elif left_v != right_v: - assert_msgs.extend([f"{left_v}", "!=", f"{right_v}"]) - return assert_msgs +def assert_results_equal(left: Result, right: Result, **kwargs): + attrs = [ + "name", + "messages", + "items_count", + "_err_items_count", + "_err_keys", + "_figures", + ] + for attr in attrs: + assert getattr(left, attr) == getattr(right, attr) + assert len(left.stats) == len(right.stats) + + def assert_dicts_equal(left: Dict, right: Dict): + assert left.keys() == right.keys() + assert len(left.items()) == len(right.items()) + for left_v, right_v in zip(left.values(), right.values()): + if isinstance(left_v, dict): + assert_dicts_equal(left_v, right_v) + elif isinstance(left_v, (pd.Series, pd.DataFrame)): + assert_tensors_equal(left_v, right_v, **kwargs) + else: + assert left_v == right_v + + for left_t, right_t in zip(left._stats, right._stats): + assert_tensors_equal(left_t, right_t) + + assert_dicts_equal(left.more_stats, right.more_stats) + + +def assert_tensors_equal(left: Stat, right: Stat, **kwargs): + if isinstance(left, pd.DataFrame): + assert_f = partial(pd.testing.assert_frame_equal, **kwargs) + elif isinstance(left, pd.Series): + assert_f = partial(pd.testing.assert_series_equal, **kwargs) + + assert_f(left, right) def create_named_df(data: Dict, index: List[str], name: str) -> pd.DataFrame: diff --git a/tests/rules/test_compare.py b/tests/rules/test_compare.py index 42802c2..ff830a3 100644 --- a/tests/rules/test_compare.py +++ b/tests/rules/test_compare.py @@ -5,76 +5,106 @@ @pytest.mark.parametrize( - ["source", 
"target", "fields", "normalize", "expected"], + ["source", "target", "fields", "normalize", "expected", "more_stats"], [ ( { - "one": list(range(50)) + ["42"] * 50, - "two": list(range(100)), - "three": [np.nan] * 50 + list(range(50)), + "one": list(range(5)) + ["42"] * 5, + "two": list(range(10)), + "three": [np.nan] * 5 + list(range(5)), }, { - "one": list(range(50, 100)) + [42] * 500, - "two": list(range(550)), - "three": [np.nan] * 500 + list(range(50)), + "one": list(range(5, 10)) + [4] * 6, + "two": list(range(11)), + "three": [np.nan] * 10 + [1], }, ["one", "two", "three"], False, { Level.INFO: [ - ("100 `non NaN ones` - 99 new, 1 same",), - ( - "50 `ones` are missing", - None, - {"50, 51, 52, 53, 54... `ones` are missing": set(range(50))}, - ), - ("100 `non NaN twos` - 0 new, 100 same",), - ("50 `non NaN threes` - 0 new, 50 same",), + ("10 `non NaN ones` - 9 new, 1 same",), + ("10 `non NaN twos` - 0 new, 10 same",), + ("1 `twos` are missing", None, {"10 `twos` are missing": {10}}), + ("5 `non NaN threes` - 4 new, 1 same",), ], Level.ERROR: [ ( - "450 `twos` are missing", + "5 `ones` are missing", None, - { - "100, 101, 102, 103, 104... 
`twos` are missing": set( - range(100, 550) - ) - }, + {"5, 6, 7, 8, 9 `ones` are missing": set(range(5))}, ) ], }, + { + "one": { + "same": pd.Series([4], index=[4], dtype="object"), + "new": pd.Series( + [0, 1, 2, 3] + ["42"] * 5, index=[0, 1, 2, 3, 5, 6, 7, 8, 9] + ), + "missing": pd.Series(list(range(5, 10))), + }, + "two": { + "same": pd.Series(list(range(10))), + "new": pd.Series(dtype=np.int64), + "missing": pd.Series([10], index=[10]), + }, + "three": { + "same": pd.Series([1.0], index=[6]), + "new": pd.Series([0.0, 2.0, 3.0, 4.0], index=[5, 7, 8, 9]), + "missing": pd.Series(), + }, + }, ), ( { - "four": [{i} for i in range(10)] - + [{"K": {"k": i}} for i in range(10)] - + ["l"] * 80 + "four": [{i} for i in range(2)] + + [{"K": {"k": i}} for i in range(2)] + + ["l"] * 6 }, { - "four": [{i} for i in range(20)] - + [{"k": {"k": i}} for i in range(10)] - + ["L"] * 520 + "four": [{i} for i in range(4)] + + [{"k": {"k": i}} for i in range(4)] + + ["L"] * 20 }, ["four"], True, { Level.INFO: [ - ("100 `non NaN fours` - 0 new, 100 same",), + ("10 `non NaN fours` - 0 new, 10 same",), ( - "10 `fours` are missing", + "4 `fours` are missing", None, { - "{10}, {11}, {12}, {13}, {14}... 
`fours` are missing": set( - range(10, 20) - ) + "{2}, {3}, {'k': {'k': 2}}, {'k': {'k': 3}} `fours` are missing": { + 2, + 3, + 6, + 7, + } }, ), ] }, + { + "four": { + "same": pd.Series( + [str({i}) for i in range(2)] + + [str({"k": {"k": i}}) for i in range(2)] + + ["l"] * 6 + ), + "new": pd.Series(dtype=object), + "missing": pd.Series( + ["{2}", "{3}", "{'k': {'k': 2}}", "{'k': {'k': 3}}"], + index={2, 3, 6, 7}, + ), + } + }, ), ], ) -def test_fields(source, target, fields, normalize, expected): - assert compare.fields( - pd.DataFrame(source), pd.DataFrame(target), fields, normalize - ) == create_result("Fields Difference", expected) +def test_fields(source, target, fields, normalize, expected, more_stats): + assert_results_equal( + compare.fields(pd.DataFrame(source), pd.DataFrame(target), fields, normalize), + create_result("Fields Difference", expected, more_stats=more_stats), + check_index_type=False, + ) diff --git a/tests/rules/test_result.py b/tests/rules/test_result.py index 23ca2b6..c9ebb0b 100755 --- a/tests/rules/test_result.py +++ b/tests/rules/test_result.py @@ -57,34 +57,6 @@ def test_result_err_keys(messages, true_err_keys): assert Result("x", messages=messages).err_keys == true_err_keys -@pytest.mark.parametrize( - "source, target", - [ - ( - pd.Series([0, 1], index=["f", "l"], name="n"), - pd.Series([0, 1], index=["f", "l"], name="n"), - ), - (pd.DataFrame([0, 1]), pd.DataFrame([0, 1])), - ], -) -def test_tensors_equal(source, target): - assert Result.tensors_equal(source, target) - - -@pytest.mark.parametrize( - "source, target", - [ - ( - pd.Series([0, 1], index=["f", "l"], name="s"), - pd.Series([0, 1], index=["f", "l"], name="n"), - ), - (pd.DataFrame([0, 1]), pd.DataFrame([0, 1], index=["m", "s"])), - ], -) -def test_tensors_not_equal(source, target): - assert not Result.tensors_equal(source, target) - - @pytest.mark.parametrize( "message, stats, outputs", [ @@ -111,59 +83,3 @@ def test_show(mocker, capsys, message, stats, outputs): 
res.show() mock_pio_show.assert_called_once_with(res.figures[0]) mocked_md.assert_has_calls(mocker.call(o) for o in outputs) - - -@pytest.mark.parametrize( - "left_params, right_params", - [ - ( - ( - "s", - {Level.INFO: ["sum", "det", {"err1": [0, 1]}]}, - [pd.Series([0], name="s"), pd.DataFrame({"s": [0]})], - 2, - ["err1"], - 1, - ), - ( - "s", - {Level.INFO: ["sum", "det", {"err1": [0, 1]}]}, - [pd.Series([0], name="s"), pd.DataFrame({"s": [0]})], - 2, - ["err1"], - 1, - ), - ), - (("s",), ("s",)), - ], -) -def test_result_equal(left_params, right_params): - assert Result(*left_params) == Result(*right_params) - - -@pytest.mark.parametrize( - "left_params, right_params", - [ - ( - ( - "s", - {Level.INFO: ["sum", "det", {"err1": [0, 1]}]}, - [pd.Series([0], name="A name"), pd.DataFrame([0])], - 2, - ["err1"], - 1, - ), - ( - "s", - {Level.INFO: ["sum", "det", {"err1": [0, 1]}]}, - [pd.Series([0], name="A series name"), pd.DataFrame([0])], - 2, - ["err1"], - 1, - ), - ), - (("s",), ("t",)), - ], -) -def test_result_not_equal(left_params, right_params): - assert Result(*left_params) != Result(*right_params) diff --git a/tests/test_conftest.py b/tests/test_conftest.py new file mode 100644 index 0000000..2b12c17 --- /dev/null +++ b/tests/test_conftest.py @@ -0,0 +1,30 @@ +from conftest import * + + +@pytest.mark.parametrize( + "source, target", + [ + ( + pd.Series([0, 1], index=["f", "l"], name="n"), + pd.Series([0, 1], index=["f", "l"], name="n"), + ), + (pd.DataFrame([0, 1]), pd.DataFrame([0, 1])), + ], +) +def test_assert_tensors_equal(source, target): + assert_tensors_equal(source, target) + + +@pytest.mark.parametrize( + "source, target", + [ + ( + pd.Series([0, 1], index=["f", "l"], name="s"), + pd.Series([0, 1], index=["f", "l"], name="n"), + ), + (pd.DataFrame([0, 1]), pd.DataFrame([0, 1], index=["m", "s"])), + ], +) +def test_assert_tensors_not_equal(source, target): + with pytest.raises(AssertionError): + assert_tensors_equal(source, target) From 
22cd4009fe6fcade0a18923192a09730cbe445c7 Mon Sep 17 00:00:00 2001 From: manycoding Date: Thu, 26 Sep 2019 15:48:14 -0300 Subject: [PATCH 08/10] Update rules to new assert --- tests/rules/test_category.py | 32 ++++++++++++++++++++------------ tests/rules/test_coverage.py | 26 +++++++++++++++----------- tests/rules/test_duplicates.py | 26 ++++++++++++++++---------- tests/rules/test_json_schema.py | 20 +++++++++++++------- tests/rules/test_metadata.py | 23 ++++++++++++++--------- tests/rules/test_others.py | 13 ++++++++----- tests/rules/test_price.py | 22 +++++++++++++++------- 7 files changed, 101 insertions(+), 61 deletions(-) diff --git a/tests/rules/test_category.py b/tests/rules/test_category.py index 462d4c4..9f9963a 100755 --- a/tests/rules/test_category.py +++ b/tests/rules/test_category.py @@ -1,6 +1,6 @@ import arche.rules.category as c from arche.rules.result import Level -from conftest import create_result, create_named_df +from conftest import * import numpy as np import pandas as pd import pytest @@ -21,8 +21,11 @@ ], ) def test_get_coverage_per_category(data, cat_names, expected_messages, expected_stats): - assert c.get_coverage_per_category(pd.DataFrame(data), cat_names) == create_result( - "Coverage For Scraped Categories", expected_messages, expected_stats + assert_results_equal( + c.get_coverage_per_category(pd.DataFrame(data), cat_names), + create_result( + "Coverage For Scraped Categories", expected_messages, expected_stats + ), ) @@ -81,10 +84,11 @@ def test_get_coverage_per_category(data, cat_names, expected_messages, expected_ ], ) def test_get_difference(source, target, categories, expected_messages, expected_stats): - assert c.get_difference( - pd.DataFrame(source), pd.DataFrame(target), categories - ) == create_result( - "Category Coverage Difference", expected_messages, stats=expected_stats + assert_results_equal( + c.get_difference(pd.DataFrame(source), pd.DataFrame(target), categories), + create_result( + "Category Coverage Difference", 
expected_messages, stats=expected_stats + ), ) @@ -96,8 +100,10 @@ def test_get_difference(source, target, categories, expected_messages, expected_ ], ) def test_get_no_categories(data, expected_message): - result = c.get_categories(pd.DataFrame(data)) - assert result == create_result("Categories", {Level.INFO: [(expected_message,)]}) + assert_results_equal( + c.get_categories(pd.DataFrame(data)), + create_result("Categories", {Level.INFO: [(expected_message,)]}), + ) @pytest.mark.parametrize( @@ -129,9 +135,11 @@ def test_get_no_categories(data, expected_message): ], ) def test_get_categories(data, max_uniques, expected_stats, expected_message): - result = c.get_categories(pd.DataFrame(data), max_uniques) - assert result == create_result( - "Categories", {Level.INFO: [(expected_message,)]}, stats=expected_stats + assert_results_equal( + c.get_categories(pd.DataFrame(data), max_uniques), + create_result( + "Categories", {Level.INFO: [(expected_message,)]}, stats=expected_stats + ), ) diff --git a/tests/rules/test_coverage.py b/tests/rules/test_coverage.py index 9b0cf9c..8433452 100755 --- a/tests/rules/test_coverage.py +++ b/tests/rules/test_coverage.py @@ -2,7 +2,7 @@ import arche.rules.coverage as cov from arche.rules.result import Level, Outcome -from conftest import create_result, create_named_df, Job +from conftest import * import pandas as pd import pytest @@ -36,8 +36,10 @@ ], ) def test_check_fields_coverage(df, expected_messages, expected_stats): - result = cov.check_fields_coverage(df) - assert result == create_result("Fields Coverage", expected_messages, expected_stats) + assert_results_equal( + cov.check_fields_coverage(df), + create_result("Fields Coverage", expected_messages, expected_stats), + ) @pytest.mark.parametrize( @@ -114,11 +116,11 @@ def test_check_fields_coverage(df, expected_messages, expected_stats): ], ) def test_get_difference(source_stats, target_stats, expected_messages, expected_stats): - result = cov.get_difference( - 
Job(stats=source_stats, key="s"), Job(stats=target_stats, key="t") - ) - assert result == create_result( - "Coverage Difference", expected_messages, stats=expected_stats + assert_results_equal( + cov.get_difference( + Job(stats=source_stats, key="s"), Job(stats=target_stats, key="t") + ), + create_result("Coverage Difference", expected_messages, stats=expected_stats), ) @@ -133,7 +135,7 @@ def test_compare_scraped_fields(source_cols, target_cols, expected_messages): result = cov.compare_scraped_fields( pd.DataFrame([], columns=source_cols), pd.DataFrame([], columns=target_cols) ) - assert result == create_result("Scraped Fields", expected_messages) + assert_results_equal(result, create_result("Scraped Fields", expected_messages)) @pytest.mark.parametrize( @@ -191,5 +193,7 @@ def test_anomalies( for key, counts, input_values in jobs_stats ] mocker.patch("arche.rules.coverage.api.get_jobs", return_value=jobs) - result = cov.anomalies(jobs_stats[-1][0], [key for key, *_ in jobs_stats[:-1]]) - assert result == create_result("Anomalies", expected_messages, stats=stats) + assert_results_equal( + cov.anomalies(jobs_stats[-1][0], [key for key, *_ in jobs_stats[:-1]]), + create_result("Anomalies", expected_messages, stats=stats), + ) diff --git a/tests/rules/test_duplicates.py b/tests/rules/test_duplicates.py index 90f6a8b..9dd2e7f 100755 --- a/tests/rules/test_duplicates.py +++ b/tests/rules/test_duplicates.py @@ -1,6 +1,6 @@ import arche.rules.duplicates as duplicates from arche.rules.result import Level, Outcome -from conftest import create_result +from conftest import * import numpy as np import pandas as pd import pytest @@ -45,8 +45,11 @@ @pytest.mark.parametrize("data, tagged_fields, expected_messages", unique_inputs) def test_find_by_unique(data, tagged_fields, expected_messages): df = pd.DataFrame(data) - assert duplicates.find_by_unique(df, tagged_fields) == create_result( - "Duplicates By **unique** Tag", expected_messages, items_count=len(df) + 
assert_results_equal( + duplicates.find_by_unique(df, tagged_fields), + create_result( + "Duplicates By **unique** Tag", expected_messages, items_count=len(df) + ), ) @@ -80,8 +83,9 @@ def test_find_by_unique(data, tagged_fields, expected_messages): ) def test_find_by(data, columns, expected_messages): df = pd.DataFrame(data) - assert duplicates.find_by(df, columns) == create_result( - "Duplicates", expected_messages, items_count=len(df) + assert_results_equal( + duplicates.find_by(df, columns), + create_result("Duplicates", expected_messages, items_count=len(df)), ) @@ -111,9 +115,11 @@ def test_find_by(data, columns, expected_messages): ) def test_find_by_name_url(data, tagged_fields, expected_messages): df = pd.DataFrame(data) - result = duplicates.find_by_name_url(df, tagged_fields) - assert result == create_result( - "Duplicates By **name_field, product_url_field** Tags", - expected_messages, - items_count=len(df), + assert_results_equal( + duplicates.find_by_name_url(df, tagged_fields), + create_result( + "Duplicates By **name_field, product_url_field** Tags", + expected_messages, + items_count=len(df), + ), ) diff --git a/tests/rules/test_json_schema.py b/tests/rules/test_json_schema.py index da0bdd2..80cd0d6 100755 --- a/tests/rules/test_json_schema.py +++ b/tests/rules/test_json_schema.py @@ -1,6 +1,6 @@ from arche.rules.json_schema import check_tags, validate from arche.rules.result import Level -from conftest import create_result +from conftest import * import pytest @@ -114,8 +114,10 @@ "source_columns, target_columns, tags, expected_messages", tags_inputs ) def test_check_tags(source_columns, target_columns, tags, expected_messages): - result = check_tags(source_columns, target_columns, tags) - assert result == create_result("Tags", expected_messages) + assert_results_equal( + check_tags(source_columns, target_columns, tags), + create_result("Tags", expected_messages), + ) @pytest.mark.parametrize( @@ -147,10 +149,14 @@ def 
test_check_tags(source_columns, target_columns, tags, expected_messages): ], ) def test_validate(get_raw_items, schema, expected_messages): - result = validate(schema, get_raw_items, range(len(get_raw_items))) - assert result == create_result("JSON Schema Validation", expected_messages) + assert_results_equal( + validate(schema, get_raw_items, range(len(get_raw_items))), + create_result("JSON Schema Validation", expected_messages), + ) def test_validate_passed(get_schema, get_raw_items): - result = validate(get_schema, get_raw_items, range(len(get_raw_items))) - assert result == create_result("JSON Schema Validation", {}) + assert_results_equal( + validate(get_schema, get_raw_items, range(len(get_raw_items))), + create_result("JSON Schema Validation", {}), + ) diff --git a/tests/rules/test_metadata.py b/tests/rules/test_metadata.py index 0f58303..5e8afd7 100755 --- a/tests/rules/test_metadata.py +++ b/tests/rules/test_metadata.py @@ -6,7 +6,7 @@ compare_response_ratio, ) from arche.rules.result import Level -from conftest import create_result, Job +from conftest import * import pytest @@ -32,7 +32,9 @@ def test_check_errors(get_job, error_count, expected_messages): job.metadata = {"scrapystats": error_count} job.key = "112358/13/21" - assert check_errors(job) == create_result("Job Errors", expected_messages) + assert_results_equal( + check_errors(job), create_result("Job Errors", expected_messages) + ) outcome_input = [ @@ -70,8 +72,9 @@ def test_check_outcome(get_job, metadata, expected_messages): job = get_job job.metadata = metadata - result = check_outcome(job) - assert result == create_result("Job Outcome", expected_messages) + assert_results_equal( + check_outcome(job), create_result("Job Outcome", expected_messages) + ) time_inputs = [ @@ -124,8 +127,10 @@ def test_compare_finish_time( source_job.metadata = source_metadata target_job.metadata = target_metadata - result = compare_finish_time(source_job, target_job) - assert result == create_result("Finish 
Time", expected_messages) + assert_results_equal( + compare_finish_time(source_job, target_job), + create_result("Finish Time", expected_messages), + ) compare_response_ratio_inputs = [ @@ -163,7 +168,7 @@ def test_compare_response_ratio( source_job = Job(stats=source_stats, metadata=source_metadata) target_job = Job(stats=target_stats, metadata=target_metadata) - result = compare_response_ratio(source_job, target_job) - assert result == create_result( - "Compare Responses Per Item Ratio", expected_messages + assert_results_equal( + compare_response_ratio(source_job, target_job), + create_result("Compare Responses Per Item Ratio", expected_messages), ) diff --git a/tests/rules/test_others.py b/tests/rules/test_others.py index 491516b..b9f9b55 100755 --- a/tests/rules/test_others.py +++ b/tests/rules/test_others.py @@ -2,7 +2,7 @@ from arche.rules.others import compare_boolean_fields, garbage_symbols from arche.rules.result import Level, Outcome -from conftest import create_named_df, create_result +from conftest import * import pandas as pd import pytest @@ -64,8 +64,8 @@ def test_compare_boolean_fields( source_df = pd.DataFrame(source_data) target_df = pd.DataFrame(target_data) rule_result = compare_boolean_fields(source_df, target_df) - assert rule_result == create_result( - "Boolean Fields", expected_messages, expected_stats + assert_results_equal( + rule_result, create_result("Boolean Fields", expected_messages, expected_stats) ) @@ -112,6 +112,9 @@ def test_compare_boolean_fields( "raw_items, expected_messages, expected_items_count", dirty_inputs ) def test_garbage_symbols(raw_items, expected_messages, expected_items_count): - assert garbage_symbols(pd.DataFrame(raw_items)) == create_result( - "Garbage Symbols", expected_messages, items_count=expected_items_count + assert_results_equal( + garbage_symbols(pd.DataFrame(raw_items)), + create_result( + "Garbage Symbols", expected_messages, items_count=expected_items_count + ), ) diff --git 
a/tests/rules/test_price.py b/tests/rules/test_price.py index 237a13c..0f6fbc3 100755 --- a/tests/rules/test_price.py +++ b/tests/rules/test_price.py @@ -1,6 +1,6 @@ import arche.rules.price as p from arche.rules.result import Level, Outcome -from conftest import create_result +from conftest import * import numpy as np import pandas as pd import pytest @@ -51,9 +51,11 @@ @pytest.mark.parametrize("data, tagged_fields, expected_messages", was_now_inputs) def test_compare_was_now(data, tagged_fields, expected_messages): df = pd.DataFrame(data) - result = p.compare_was_now(df, tagged_fields) - assert result == create_result( - "Compare Price Was And Now", expected_messages, items_count=len(df) + assert_results_equal( + p.compare_was_now(df, tagged_fields), + create_result( + "Compare Price Was And Now", expected_messages, items_count=len(df) + ), ) @@ -88,7 +90,9 @@ def test_compare_prices_for_same_urls( result = p.compare_prices_for_same_urls( pd.DataFrame(source_data), pd.DataFrame(target_data), tagged_fields ) - assert result == create_result("Compare Prices For Same Urls", expected_messages) + assert_results_equal( + result, create_result("Compare Prices For Same Urls", expected_messages) + ) compare_names_inputs = [ @@ -122,7 +126,9 @@ def test_compare_names_for_same_urls( result = p.compare_names_for_same_urls( pd.DataFrame(source_data), pd.DataFrame(target_data), tagged_fields ) - assert result == create_result("Compare Names Per Url", expected_messages) + assert_results_equal( + result, create_result("Compare Names Per Url", expected_messages) + ) @pytest.mark.parametrize( @@ -152,4 +158,6 @@ def test_compare_prices_for_same_names( result = p.compare_prices_for_same_names( pd.DataFrame(source_data), pd.DataFrame(target_data), tagged_fields ) - assert result == create_result("Compare Prices For Same Names", expected_messages) + assert_results_equal( + result, create_result("Compare Prices For Same Names", expected_messages) + ) From 
96e4aa629b819655d0c5e665e2443e6b1d5bee11 Mon Sep 17 00:00:00 2001 From: manycoding Date: Thu, 26 Sep 2019 15:48:24 -0300 Subject: [PATCH 09/10] Fix tqdm warning --- Pipfile | 1 + src/arche/readers/items.py | 4 ++-- src/arche/rules/category.py | 4 ++-- src/arche/rules/others.py | 6 ++---- src/arche/tools/api.py | 4 ++-- src/arche/tools/schema.py | 10 +++------- 6 files changed, 12 insertions(+), 17 deletions(-) diff --git a/Pipfile b/Pipfile index 1ff4a44..c53a891 100755 --- a/Pipfile +++ b/Pipfile @@ -35,6 +35,7 @@ recommonmark = "*" sphinxcontrib-golangdomain = {git = "https://bitbucket.org/ymotongpoo/sphinxcontrib-golangdomain"} sphinx-autoapi = {git = "https://github.com/rtfd/sphinx-autoapi"} nbsphinx = "*" +sphinx_bootstrap_theme = "*" memory-profiler = "*" jupyter-console = "*" matplotlib = "*" diff --git a/src/arche/readers/items.py b/src/arche/readers/items.py index c33ac24..fb9478b 100755 --- a/src/arche/readers/items.py +++ b/src/arche/readers/items.py @@ -8,7 +8,7 @@ import pandas as pd from scrapinghub import ScrapinghubClient from scrapinghub.client.jobs import Job -from tqdm import tqdm_notebook +from tqdm.notebook import tqdm RawItems = Iterable[Dict[str, Any]] @@ -33,7 +33,7 @@ def categorize(df: pd.DataFrame) -> pd.DataFrame: """Cast columns with repeating values to `category` type to save memory""" if len(df) < 100: return - for c in tqdm_notebook(df.columns, desc="Categorizing"): + for c in tqdm(df.columns, desc="Categorizing"): try: if df[c].nunique(dropna=False) <= 10: df[c] = df[c].astype("category") diff --git a/src/arche/rules/category.py b/src/arche/rules/category.py index 5970acb..f4691b1 100755 --- a/src/arche/rules/category.py +++ b/src/arche/rules/category.py @@ -2,7 +2,7 @@ from arche.rules.result import Outcome, Result import pandas as pd -from tqdm import tqdm_notebook +from tqdm.notebook import tqdm def get_difference( @@ -97,7 +97,7 @@ def get_categories(df: pd.DataFrame, max_uniques: int = 10) -> Result: columns = 
find_likely_cats(df, max_uniques) result.stats = [ value_counts - for value_counts in tqdm_notebook( + for value_counts in tqdm( map(lambda c: df[c].value_counts(dropna=False), columns), desc="Finding categories", total=len(columns), diff --git a/src/arche/rules/others.py b/src/arche/rules/others.py index 211872f..4851a2f 100755 --- a/src/arche/rules/others.py +++ b/src/arche/rules/others.py @@ -4,7 +4,7 @@ from arche.rules.result import Outcome, Result import numpy as np import pandas as pd -from tqdm import tqdm_notebook +from tqdm.notebook import tqdm def compare_boolean_fields( @@ -93,9 +93,7 @@ def garbage_symbols(df: pd.DataFrame) -> Result: row_keys = set() rule_result = Result("Garbage Symbols", items_count=len(df)) - for column in tqdm_notebook( - df.select_dtypes([np.object]).columns, desc="Garbage Symbols" - ): + for column in tqdm(df.select_dtypes([np.object]).columns, desc="Garbage Symbols"): matches = df[column].apply(str).str.extractall(garbage, flags=re.IGNORECASE) if not matches.empty: error_keys = df.loc[matches.unstack().index.values].index diff --git a/src/arche/tools/api.py b/src/arche/tools/api.py index cd56b2a..c5a30b8 100755 --- a/src/arche/tools/api.py +++ b/src/arche/tools/api.py @@ -10,7 +10,7 @@ import numpy as np from scrapinghub import ScrapinghubClient from scrapinghub.client.jobs import Job -from tqdm import tqdm, tqdm_notebook +from tqdm import tqdm, notebook Filters = List[Tuple[str, str, str]] @@ -163,7 +163,7 @@ def get_items( start_index: int, start: Optional[str], filters: Optional[Filters] = None, - p_bar: Union[tqdm, tqdm_notebook] = tqdm_notebook, + p_bar: Union[tqdm, notebook.tqdm] = notebook.tqdm, desc: Optional[str] = None, ) -> np.ndarray: source = get_source(key) diff --git a/src/arche/tools/schema.py b/src/arche/tools/schema.py index 3b607dc..a61eecc 100755 --- a/src/arche/tools/schema.py +++ b/src/arche/tools/schema.py @@ -10,7 +10,7 @@ from genson import SchemaBuilder from jsonschema import FormatChecker, validators 
import pandas as pd -from tqdm import tqdm_notebook +from tqdm.notebook import tqdm def basic_json_schema(data_source: str, items_numbers: List[int] = None) -> Schema: @@ -95,9 +95,7 @@ def fast_validate( errors = defaultdict(set) validate = fastjsonschema.compile(schema) - for i, raw_item in enumerate( - tqdm_notebook(raw_items, desc="Fast Schema Validation") - ): + for i, raw_item in enumerate(tqdm(raw_items, desc="Fast Schema Validation")): raw_item.pop("_type", None) raw_item.pop("_key", None) try: @@ -117,9 +115,7 @@ def full_validate( validator = validators.validator_for(schema)(schema) validator.format_checker = FormatChecker() - for i, raw_item in enumerate( - tqdm_notebook(raw_items, desc="JSON Schema Validation") - ): + for i, raw_item in enumerate(tqdm(raw_items, desc="JSON Schema Validation")): raw_item.pop("_type", None) raw_item.pop("_key", None) for e in validator.iter_errors(raw_item): From 933d05eed7f6daa71e6af8d70bed28c6090ea34b Mon Sep 17 00:00:00 2001 From: manycoding Date: Mon, 30 Sep 2019 12:42:21 -0300 Subject: [PATCH 10/10] Add line to changes, describe err_thr --- CHANGES.md | 1 + src/arche/rules/compare.py | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGES.md b/CHANGES.md index 1e0cc67..9b6c73d 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -14,6 +14,7 @@ Note that the top-most release is changes in the unreleased master branch on Git ### Added - **Anomalies** to see significant deviations in fields coverage across multiple jobs, #138 - Support to **Bitbucket API**, in order to access files from private repositories, #71 +- **Fields Difference** rule to find the difference between field values of two jobs. 
Supports normalization, nested fields, full access to the data, #167 ## [0.3.6] (2019-07-12) diff --git a/src/arche/rules/compare.py b/src/arche/rules/compare.py index eb4a237..73c2783 100644 --- a/src/arche/rules/compare.py +++ b/src/arche/rules/compare.py @@ -12,11 +12,12 @@ def fields( normalize: bool = False, err_thr: float = 0.25, ) -> Result: - """Return fields values difference between dataframe. + """Find the difference in field values between two dataframes. Args: names - a list of field names normalize - if set, all fields converted to str and processed with lower() and strip() + err_thr - sets the failure threshold for missing values Returns: Result with same, missing and new values.