diff --git a/CHANGES.md b/CHANGES.md
index 1e0cc67..9b6c73d 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -14,6 +14,7 @@ Note that the top-most release is changes in the unreleased master branch on Git
 ### Added
 - **Anomalies** to see significant deviations in fields coverage across multiple jobs, #138
 - Support to **Bitbucket API**, in order to access files from private repositories, #71
+- **Fields Difference** rule to find the difference between field values of two jobs. Supports normalization, nested fields, and full access to the data, #167
 
 ## [0.3.6] (2019-07-12)
diff --git a/Pipfile b/Pipfile
index 8e1cec6..d5d4bc6 100755
--- a/Pipfile
+++ b/Pipfile
@@ -35,6 +35,7 @@ recommonmark = "*"
 sphinxcontrib-golangdomain = {git = "https://bitbucket.org/ymotongpoo/sphinxcontrib-golangdomain"}
 sphinx-autoapi = {git = "https://github.com/rtfd/sphinx-autoapi"}
 nbsphinx = "*"
+sphinx_bootstrap_theme = "*"
 memory-profiler = "*"
 jupyter-console = "*"
 matplotlib = "*"
diff --git a/docs/source/nbs/Rules.ipynb b/docs/source/nbs/Rules.ipynb
index 5089f1a..35520ac 100644
--- a/docs/source/nbs/Rules.ipynb
+++ b/docs/source/nbs/Rules.ipynb
@@ -200,6 +200,32 @@
     "arche.rules.category.get_difference(df, target_df, [\"category\"]).show()"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Compare\n",
+    "### Fields"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "help(arche.rules.compare.fields)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "arche.rules.compare.fields(df, target_df, [\"part_number\", \"name\", \"uom\"]).show()"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
diff --git a/src/arche/__init__.py b/src/arche/__init__.py
index cf3d18e..109f08f 100755
--- a/src/arche/__init__.py
+++ b/src/arche/__init__.py
@@ -1,4 +1,5 @@
 import logging
+from typing import *  # noqa
 
 __version__ = "0.3.6"
 SH_URL = "https://app.scrapinghub.com/p"  # noqa
diff --git a/src/arche/arche.py b/src/arche/arche.py
index 2bc55f5..dd00208 100755
--- a/src/arche/arche.py
+++ b/src/arche/arche.py
@@ -7,6 +7,7 @@
 from arche.readers.schema import Schema, SchemaSource
 from arche.report import Report
 import arche.rules.category as category_rules
+import arche.rules.compare as compare
 import arche.rules.coverage as coverage_rules
 import arche.rules.duplicates as duplicate_rules
 import arche.rules.json_schema as schema_rules
@@ -256,3 +257,11 @@ def compare_with_customized_rules(self, source_items, target_items, tagged_field
             price_rules.compare_prices_for_same_names,
         ]:
             self.save_result(r(source_items.df, target_items.df, tagged_fields))
+        self.save_result(
+            compare.tagged_fields(
+                source_items.df,
+                target_items.df,
+                tagged_fields,
+                ["product_url_field", "name_field"],
+            )
+        )
diff --git a/src/arche/readers/items.py b/src/arche/readers/items.py
index 1a69ce1..93453a9 100755
--- a/src/arche/readers/items.py
+++ b/src/arche/readers/items.py
@@ -8,7 +8,7 @@
 import pandas as pd
 from scrapinghub import ScrapinghubClient
 from scrapinghub.client.jobs import Job
-from tqdm import tqdm_notebook
+from tqdm.notebook import tqdm
 
 RawItems = Iterable[Dict[str, Any]]
@@ -33,7 +33,7 @@ def categorize(df: pd.DataFrame) -> pd.DataFrame:
     """Cast columns with repeating values to `category` type to save memory"""
     if len(df) < 100:
         return
-    for c in tqdm_notebook(df.columns, desc="Categorizing"):
+    for c in tqdm(df.columns, desc="Categorizing"):
         try:
             if df[c].nunique(dropna=False) <= 10:
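+                # Only low-cardinality columns (at most 10 unique values) are cast to `category`.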
                 df[c] = df[c].astype("category")
diff --git a/src/arche/rules/category.py b/src/arche/rules/category.py
index 5970acb..f4691b1 100755
--- a/src/arche/rules/category.py
+++ b/src/arche/rules/category.py
@@ -2,7 +2,7 @@
 from arche.rules.result import Outcome, Result
 import pandas as pd
-from tqdm import tqdm_notebook
+from tqdm.notebook import tqdm
 
 
 def get_difference(
@@ -97,7 +97,7 @@ def get_categories(df: pd.DataFrame, max_uniques: int = 10) -> Result:
     columns = find_likely_cats(df, max_uniques)
     result.stats = [
         value_counts
-        for value_counts in tqdm_notebook(
+        for value_counts in tqdm(
             map(lambda c: df[c].value_counts(dropna=False), columns),
             desc="Finding categories",
             total=len(columns),
diff --git a/src/arche/rules/compare.py b/src/arche/rules/compare.py
new file mode 100644
index 0000000..f3c4082
--- /dev/null
+++ b/src/arche/rules/compare.py
@@ -0,0 +1,98 @@
+from typing import Tuple
+
+from arche.readers.schema import TaggedFields
+from arche.rules.result import *
+
+
+MAX_MISSING_VALUES = 6
+
+
+def fields(
+    source_df: pd.DataFrame,
+    target_df: pd.DataFrame,
+    names: List[str],
+    normalize: bool = False,
+    err_thr: float = 0.25,
+) -> Result:
+    """Find the difference between the given fields' values in two dataframes.
+
+    Args:
+        names: a list of field names to compare
+        normalize: if True, all values are converted to str, lowercased and stripped
+        err_thr: the failure threshold for the share of missing values
+
+    Returns:
+        A result with same, new and missing values.
+    """
+
+    def get_difference(
+        left: pd.Series, right: pd.Series
+    ) -> Tuple[pd.Series, pd.Series, pd.Series]:
+        return (
+            left[left.isin(right)],
+            left[~(left.isin(right))],
+            right[~(right.isin(left))],
+        )
+
+    result = Result("Fields Difference")
+    for field in names:
+        source = source_df[field].dropna()
+        target = target_df[field].dropna()
+        if normalize:
+            source = source.astype(str).str.lower().str.strip()
+            target = target.astype(str).str.lower().str.strip()
+        try:
+            same, new, missing = get_difference(source, target)
+        except SystemError:
+            source = source.astype(str)
+            target = target.astype(str)
+            same, new, missing = get_difference(source, target)
+
+        same.name, new.name, missing.name = (None, None, None)
+        result.more_stats.update(
+            {f"{field}": {"same": same, "new": new, "missing": missing}}
+        )
+        result.add_info(
+            f"{len(source)} `non NaN {field}s` - {len(new)} new, {len(same)} same"
+        )
+        if len(missing) == 0:
+            continue
+
+        if len(missing) < MAX_MISSING_VALUES:
+            msg = ", ".join(missing.unique().astype(str))
+        else:
+            msg = f"{', '.join(missing.unique()[:5].astype(str))}..."
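+        # When MAX_MISSING_VALUES (6) or more values are missing, only the first five uniques are listed.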
+ msg = f"{msg} `{field}s` are missing" + if len(missing) / len(target_df) >= err_thr: + result.add_error( + f"{len(missing)} `{field}s` are missing", + errors={msg: set(missing.index)}, + ) + else: + result.add_info( + f"{len(missing)} `{field}s` are missing", + errors={msg: set(missing.index)}, + ) + return result + + +def tagged_fields( + source_df: pd.DataFrame, + target_df: pd.DataFrame, + tagged_fields: TaggedFields, + tags: List[str], +) -> Result: + """Compare fields tagged with `tags` between two dataframes.""" + name = f"{', '.join(tags)} Fields Difference" + result = Result(name) + fields_names: List[str] = list() + for tag in tags: + tag_fields = tagged_fields.get(tag) + if tag_fields: + fields_names.extend(tag_fields) + if not fields_names: + result.add_info(Outcome.SKIPPED) + return result + result = fields(source_df, target_df, fields_names) + result.name = name + return result diff --git a/src/arche/rules/others.py b/src/arche/rules/others.py index 81453b4..1710df0 100755 --- a/src/arche/rules/others.py +++ b/src/arche/rules/others.py @@ -5,7 +5,7 @@ from arche.rules.result import Outcome, Result import numpy as np import pandas as pd -from tqdm import tqdm_notebook +from tqdm.notebook import tqdm def compare_boolean_fields( @@ -94,9 +94,7 @@ def garbage_symbols(df: pd.DataFrame) -> Result: row_keys: Set = set() rule_result = Result("Garbage Symbols", items_count=len(df)) - for column in tqdm_notebook( - df.select_dtypes([np.object]).columns, desc="Garbage Symbols" - ): + for column in tqdm(df.select_dtypes([np.object]).columns, desc="Garbage Symbols"): matches = df[column].apply(str).str.extractall(garbage, flags=re.IGNORECASE) if not matches.empty: error_keys = df.loc[matches.unstack().index.values].index diff --git a/src/arche/rules/price.py b/src/arche/rules/price.py index 7835c0d..5c2fb2e 100755 --- a/src/arche/rules/price.py +++ b/src/arche/rules/price.py @@ -67,14 +67,13 @@ def compare_was_now(df: pd.DataFrame, tagged_fields: TaggedFields): def compare_prices_for_same_urls( source_df: pd.DataFrame, target_df: pd.DataFrame, tagged_fields: TaggedFields -): +) -> Result: """For each pair of items that have the same `product_url_field` tagged field, compare `product_price_field` field Returns: - A result containing pairs of items with same `product_url_field` - from `source_df` and `target_df` which `product_price_field` differ, - missing and new `product_url_field` tagged fields. + A result containing pairs of items from `source_df` and `target_df` + which `product_price_field` differ. 
""" result = Result("Compare Prices For Same Urls") url_field_list: Optional[List[str]] = tagged_fields.get("product_url_field") @@ -90,31 +89,12 @@ def compare_prices_for_same_urls( same_urls = source_df[(source_df[url_field].isin(target_df[url_field].values))][ url_field ] - new_urls = source_df[~(source_df[url_field].isin(target_df[url_field].values))][ - url_field - ] - missing_urls = target_df[(~target_df[url_field].isin(source_df[url_field].values))][ - url_field - ] - errors = {} - for url, group in missing_urls.groupby(missing_urls): - errors[f"Missing {url}"] = set(group.index) - - if not missing_urls.empty: - result.add_info( - f"{len(missing_urls)} urls missing from the tested job", errors=errors - ) - if not new_urls.empty: - result.add_info(f"{len(new_urls)} new urls in the tested job") - result.add_info(f"{len(same_urls)} same urls in both jobs") - - diff_prices_count = 0 - price_field_tag = tagged_fields.get("product_price_field") - if not price_field_tag: + price_fields = tagged_fields.get("product_price_field") + if not price_fields: result.add_info("product_price_field tag is not set") else: - price_field = price_field_tag[0] + price_field = price_fields[0] detailed_messages = [] for url in same_urls: if url.strip() != "nan": @@ -130,7 +110,6 @@ def compare_prices_for_same_urls( and is_number(target_price) and ratio_diff(source_price, target_price) > 0.1 ): - diff_prices_count += 1 source_key = source_df[source_df[url_field] == url].index[0] target_key = target_df[target_df[url_field] == url].index[0] msg = ( @@ -139,7 +118,7 @@ def compare_prices_for_same_urls( ) detailed_messages.append(msg) - res = f"{len(same_urls)} checked, {diff_prices_count} errors" + res = f"{len(same_urls)} checked, {len(detailed_messages)} errors" if detailed_messages: result.add_error(res, detailed="\n".join(detailed_messages)) else: @@ -214,33 +193,12 @@ def compare_prices_for_same_names( same_names = source_df[(source_df[name_field].isin(target_df[name_field].values))][ name_field ] - new_names = source_df[~(source_df[name_field].isin(target_df[name_field].values))][ - name_field - ] - missing_names = target_df[ - ~(target_df[name_field].isin(source_df[name_field].values)) - ][name_field] - - errors = {} - for name, group in missing_names.groupby(missing_names): - errors[f"Missing {name}"] = set(group.index) - - if not missing_names.empty: - result.add_info( - f"{len(missing_names)} names missing from the tested job", errors=errors - ) - if not new_names.empty: - result.add_info(f"{len(new_names)} new names in the tested job") - result.add_info(f"{len(same_names)} same names in both jobs") - price_tag = "product_price_field" - price_field_tag = tagged_fields.get(price_tag) - if not price_field_tag: + price_fields = tagged_fields.get("product_price_field") + if not price_fields: result.add_info("product_price_field tag is not set") return result - - price_field = price_field_tag[0] - count = 0 + price_field = price_fields[0] detailed_messages = [] for name in same_names: @@ -249,7 +207,6 @@ def compare_prices_for_same_names( target_price = target_df[target_df[name_field] == name][price_field].iloc[0] if is_number(source_price) and is_number(target_price): if ratio_diff(source_price, target_price) > 0.1: - count += 1 source_key = source_df[source_df[name_field] == name].index[0] target_key = target_df[target_df[name_field] == name].index[0] msg = ( @@ -258,7 +215,7 @@ def compare_prices_for_same_names( ) detailed_messages.append(msg) - result_msg = f"{len(same_names)} checked, {count} errors" + 
result_msg = f"{len(same_names)} checked, {len(detailed_messages)} errors" if detailed_messages: result.add_error(result_msg, detailed="\n".join(detailed_messages)) else: diff --git a/src/arche/rules/result.py b/src/arche/rules/result.py index 87c5282..da0512e 100755 --- a/src/arche/rules/result.py +++ b/src/arche/rules/result.py @@ -65,35 +65,12 @@ class Result: name: str messages: Dict[Level, List[Message]] = field(default_factory=dict) - _stats: Optional[List[Stat]] = field(default_factory=list) - items_count: Optional[int] = 0 + _stats: List[Stat] = field(default_factory=list) + more_stats: Dict[str, Dict] = field(default_factory=dict) + items_count: int = 0 _err_keys: Set[Union[str, int]] = field(default_factory=set) - _err_items_count: Optional[int] = 0 - _figures: Optional[List[go.FigureWidget]] = field(default_factory=list) - - def __eq__(self, other): - for left, right in zip(self.stats, other.stats): - if not self.tensors_equal(left, right): - return False - - return ( - self.name == other.name - and self.messages == other.messages - and self.items_count == other.items_count - and self.err_items_count == other.err_items_count - and len(self.stats) == len(other.stats) - ) - - @staticmethod - def tensors_equal(left: Stat, right: Stat): - try: - if isinstance(left, pd.DataFrame): - pd.testing.assert_frame_equal(left, right) - else: - pd.testing.assert_series_equal(left, right) - return True - except AssertionError: - return False + _err_items_count: int = 0 + _figures: List[go.FigureWidget] = field(default_factory=list) @property def info(self): diff --git a/src/arche/tools/api.py b/src/arche/tools/api.py index db19984..f5ee638 100755 --- a/src/arche/tools/api.py +++ b/src/arche/tools/api.py @@ -10,7 +10,7 @@ import numpy as np from scrapinghub import ScrapinghubClient from scrapinghub.client.jobs import Job -from tqdm import tqdm, tqdm_notebook +from tqdm import tqdm, notebook Filters = List[Tuple[str, str, str]] @@ -166,7 +166,7 @@ def get_items( start_index: int, start: Optional[str], filters: Optional[Filters] = None, - p_bar: Union[tqdm, tqdm_notebook] = tqdm_notebook, + p_bar: Union[tqdm, notebook.tqdm] = notebook.tqdm, desc: Optional[str] = None, ) -> np.ndarray: source = get_source(key) diff --git a/src/arche/tools/schema.py b/src/arche/tools/schema.py index dc0c09a..20ac91b 100755 --- a/src/arche/tools/schema.py +++ b/src/arche/tools/schema.py @@ -10,7 +10,7 @@ from genson import SchemaBuilder from jsonschema import FormatChecker, validators import pandas as pd -from tqdm import tqdm_notebook +from tqdm.notebook import tqdm def basic_json_schema(data_source: str, items_numbers: List[int] = None) -> Schema: @@ -95,9 +95,7 @@ def fast_validate( errors: DefaultDict = defaultdict(set) validate = fastjsonschema.compile(schema) - for i, raw_item in enumerate( - tqdm_notebook(raw_items, desc="Fast Schema Validation") - ): + for i, raw_item in enumerate(tqdm(raw_items, desc="Fast Schema Validation")): raw_item.pop("_type", None) raw_item.pop("_key", None) try: @@ -117,9 +115,7 @@ def full_validate( validator = validators.validator_for(schema)(schema) validator.format_checker = FormatChecker() - for i, raw_item in enumerate( - tqdm_notebook(raw_items, desc="JSON Schema Validation") - ): + for i, raw_item in enumerate(tqdm(raw_items, desc="JSON Schema Validation")): raw_item.pop("_type", None) raw_item.pop("_key", None) for e in validator.iter_errors(raw_item): diff --git a/tests/conftest.py b/tests/conftest.py index f858d53..269cb62 100755 --- a/tests/conftest.py +++ 
@@ -1,6 +1,6 @@
 from copy import deepcopy
-from itertools import zip_longest
-from typing import Dict, List, Optional, Tuple
+from functools import partial
+from typing import Any, Dict, List, Optional, Tuple
 
 from arche.readers.items import CollectionItems, JobItems
 from arche.rules.result import Level, Result, Stat
@@ -209,6 +209,7 @@ def create_result(
     messages: Dict[Level, List[Tuple]],
     stats: Optional[List[Stat]] = None,
     items_count: Optional[int] = None,
+    more_stats: Optional[Dict[str, Any]] = None,
 ) -> Result:
     result = Result(rule_name)
     for level, messages_list in messages.items():
@@ -217,30 +218,50 @@ def create_result(
     if stats:
         result.stats = stats
+    if more_stats:
+        result.more_stats = more_stats
     if items_count:
         result.items_count = items_count
     return result
 
 
-def pytest_assertrepr_compare(op, left, right):
-    if isinstance(left, Result) and isinstance(right, Result) and op == "==":
-        assert_msgs = ["Results are equal"]
-        for (left_n, left_v), (_, right_v) in zip_longest(
-            left.__dict__.items(), right.__dict__.items()
-        ):
-            if left_n == "_stats":
-                for left_stat, right_stat in zip_longest(left_v, right_v):
-                    try:
-                        if isinstance(left_stat, pd.DataFrame):
-                            pd.testing.assert_frame_equal(left_stat, right_stat)
-                        else:
-                            pd.testing.assert_series_equal(left_stat, right_stat)
-                    except AssertionError as e:
-                        assert_msgs.extend([f"{left_stat}", "!=", f"{right_stat}"])
-                        assert_msgs.extend(str(e).split("\n"))
-            elif left_v != right_v:
-                assert_msgs.extend([f"{left_v}", "!=", f"{right_v}"])
-        return assert_msgs
+def assert_results_equal(left: Result, right: Result, **kwargs):
+    attrs = [
+        "name",
+        "messages",
+        "items_count",
+        "_err_items_count",
+        "_err_keys",
+        "_figures",
+    ]
+    for attr in attrs:
+        assert getattr(left, attr) == getattr(right, attr)
+    assert len(left.stats) == len(right.stats)
+
+    def assert_dicts_equal(left: Dict, right: Dict):
+        assert left.keys() == right.keys()
+        assert len(left.items()) == len(right.items())
+        for left_v, right_v in zip(left.values(), right.values()):
+            if isinstance(left_v, dict):
+                assert_dicts_equal(left_v, right_v)
+            elif isinstance(left_v, (pd.Series, pd.DataFrame)):
+                assert_tensors_equal(left_v, right_v, **kwargs)
+            else:
+                assert left_v == right_v
+
+    for left_t, right_t in zip(left._stats, right._stats):
+        assert_tensors_equal(left_t, right_t)
+
+    assert_dicts_equal(left.more_stats, right.more_stats)
+
+
+def assert_tensors_equal(left: Stat, right: Stat, **kwargs):
+    if isinstance(left, pd.DataFrame):
+        assert_f = partial(pd.testing.assert_frame_equal, **kwargs)
+    elif isinstance(left, pd.Series):
+        assert_f = partial(pd.testing.assert_series_equal, **kwargs)
+
+    assert_f(left, right)
 
 
 def create_named_df(data: Dict, index: List[str], name: str) -> pd.DataFrame:
diff --git a/tests/rules/test_category.py b/tests/rules/test_category.py
index 462d4c4..9f9963a 100755
--- a/tests/rules/test_category.py
+++ b/tests/rules/test_category.py
@@ -1,6 +1,6 @@
 import arche.rules.category as c
 from arche.rules.result import Level
-from conftest import create_result, create_named_df
+from conftest import *
 import numpy as np
 import pandas as pd
 import pytest
@@ -21,8 +21,11 @@
     ],
 )
 def test_get_coverage_per_category(data, cat_names, expected_messages, expected_stats):
-    assert c.get_coverage_per_category(pd.DataFrame(data), cat_names) == create_result(
-        "Coverage For Scraped Categories", expected_messages, expected_stats
+    assert_results_equal(
+        c.get_coverage_per_category(pd.DataFrame(data), cat_names),
+        create_result(
+            "Coverage For Scraped Categories", expected_messages, expected_stats
+        ),
     )
@@ -81,10 +84,11 @@ def test_get_coverage_per_category(data, cat_names, expected_messages, expected_
     ],
 )
 def test_get_difference(source, target, categories, expected_messages, expected_stats):
-    assert c.get_difference(
-        pd.DataFrame(source), pd.DataFrame(target), categories
-    ) == create_result(
-        "Category Coverage Difference", expected_messages, stats=expected_stats
+    assert_results_equal(
+        c.get_difference(pd.DataFrame(source), pd.DataFrame(target), categories),
+        create_result(
+            "Category Coverage Difference", expected_messages, stats=expected_stats
+        ),
     )
@@ -96,8 +100,10 @@
     ],
 )
 def test_get_no_categories(data, expected_message):
-    result = c.get_categories(pd.DataFrame(data))
-    assert result == create_result("Categories", {Level.INFO: [(expected_message,)]})
+    assert_results_equal(
+        c.get_categories(pd.DataFrame(data)),
+        create_result("Categories", {Level.INFO: [(expected_message,)]}),
+    )
 
 
 @pytest.mark.parametrize(
@@ -129,9 +135,11 @@
     ],
 )
 def test_get_categories(data, max_uniques, expected_stats, expected_message):
-    result = c.get_categories(pd.DataFrame(data), max_uniques)
-    assert result == create_result(
-        "Categories", {Level.INFO: [(expected_message,)]}, stats=expected_stats
+    assert_results_equal(
+        c.get_categories(pd.DataFrame(data), max_uniques),
+        create_result(
+            "Categories", {Level.INFO: [(expected_message,)]}, stats=expected_stats
+        ),
    )
diff --git a/tests/rules/test_compare.py b/tests/rules/test_compare.py
new file mode 100644
index 0000000..ff830a3
--- /dev/null
+++ b/tests/rules/test_compare.py
@@ -0,0 +1,110 @@
+import arche.rules.compare as compare
+from arche.rules.result import Level
+from conftest import *
+import pytest
+
+
+@pytest.mark.parametrize(
+    ["source", "target", "fields", "normalize", "expected", "more_stats"],
+    [
+        (
+            {
+                "one": list(range(5)) + ["42"] * 5,
+                "two": list(range(10)),
+                "three": [np.nan] * 5 + list(range(5)),
+            },
+            {
+                "one": list(range(5, 10)) + [4] * 6,
+                "two": list(range(11)),
+                "three": [np.nan] * 10 + [1],
+            },
+            ["one", "two", "three"],
+            False,
+            {
+                Level.INFO: [
+                    ("10 `non NaN ones` - 9 new, 1 same",),
+                    ("10 `non NaN twos` - 0 new, 10 same",),
+                    ("1 `twos` are missing", None, {"10 `twos` are missing": {10}}),
+                    ("5 `non NaN threes` - 4 new, 1 same",),
+                ],
+                Level.ERROR: [
+                    (
+                        "5 `ones` are missing",
+                        None,
+                        {"5, 6, 7, 8, 9 `ones` are missing": set(range(5))},
+                    )
+                ],
+            },
+            {
+                "one": {
+                    "same": pd.Series([4], index=[4], dtype="object"),
+                    "new": pd.Series(
+                        [0, 1, 2, 3] + ["42"] * 5, index=[0, 1, 2, 3, 5, 6, 7, 8, 9]
+                    ),
+                    "missing": pd.Series(list(range(5, 10))),
+                },
+                "two": {
+                    "same": pd.Series(list(range(10))),
+                    "new": pd.Series(dtype=np.int64),
+                    "missing": pd.Series([10], index=[10]),
+                },
+                "three": {
+                    "same": pd.Series([1.0], index=[6]),
+                    "new": pd.Series([0.0, 2.0, 3.0, 4.0], index=[5, 7, 8, 9]),
+                    "missing": pd.Series(),
+                },
+            },
+        ),
+        (
+            {
+                "four": [{i} for i in range(2)]
+                + [{"K": {"k": i}} for i in range(2)]
+                + ["l"] * 6
+            },
+            {
+                "four": [{i} for i in range(4)]
+                + [{"k": {"k": i}} for i in range(4)]
+                + ["L"] * 20
+            },
+            ["four"],
+            True,
+            {
+                Level.INFO: [
+                    ("10 `non NaN fours` - 0 new, 10 same",),
+                    (
+                        "4 `fours` are missing",
+                        None,
+                        {
+                            "{2}, {3}, {'k': {'k': 2}}, {'k': {'k': 3}} `fours` are missing": {
+                                2,
+                                3,
+                                6,
+                                7,
+                            }
+                        },
+                    ),
+                ]
+            },
+            {
+                "four": {
{ + "same": pd.Series( + [str({i}) for i in range(2)] + + [str({"k": {"k": i}}) for i in range(2)] + + ["l"] * 6 + ), + "new": pd.Series(dtype=object), + "missing": pd.Series( + ["{2}", "{3}", "{'k': {'k': 2}}", "{'k': {'k': 3}}"], + index={2, 3, 6, 7}, + ), + } + }, + ), + ], +) +def test_fields(source, target, fields, normalize, expected, more_stats): + assert_results_equal( + compare.fields(pd.DataFrame(source), pd.DataFrame(target), fields, normalize), + create_result("Fields Difference", expected, more_stats=more_stats), + check_index_type=False, + ) diff --git a/tests/rules/test_coverage.py b/tests/rules/test_coverage.py index 9b0cf9c..8433452 100755 --- a/tests/rules/test_coverage.py +++ b/tests/rules/test_coverage.py @@ -2,7 +2,7 @@ import arche.rules.coverage as cov from arche.rules.result import Level, Outcome -from conftest import create_result, create_named_df, Job +from conftest import * import pandas as pd import pytest @@ -36,8 +36,10 @@ ], ) def test_check_fields_coverage(df, expected_messages, expected_stats): - result = cov.check_fields_coverage(df) - assert result == create_result("Fields Coverage", expected_messages, expected_stats) + assert_results_equal( + cov.check_fields_coverage(df), + create_result("Fields Coverage", expected_messages, expected_stats), + ) @pytest.mark.parametrize( @@ -114,11 +116,11 @@ def test_check_fields_coverage(df, expected_messages, expected_stats): ], ) def test_get_difference(source_stats, target_stats, expected_messages, expected_stats): - result = cov.get_difference( - Job(stats=source_stats, key="s"), Job(stats=target_stats, key="t") - ) - assert result == create_result( - "Coverage Difference", expected_messages, stats=expected_stats + assert_results_equal( + cov.get_difference( + Job(stats=source_stats, key="s"), Job(stats=target_stats, key="t") + ), + create_result("Coverage Difference", expected_messages, stats=expected_stats), ) @@ -133,7 +135,7 @@ def test_compare_scraped_fields(source_cols, target_cols, expected_messages): result = cov.compare_scraped_fields( pd.DataFrame([], columns=source_cols), pd.DataFrame([], columns=target_cols) ) - assert result == create_result("Scraped Fields", expected_messages) + assert_results_equal(result, create_result("Scraped Fields", expected_messages)) @pytest.mark.parametrize( @@ -191,5 +193,7 @@ def test_anomalies( for key, counts, input_values in jobs_stats ] mocker.patch("arche.rules.coverage.api.get_jobs", return_value=jobs) - result = cov.anomalies(jobs_stats[-1][0], [key for key, *_ in jobs_stats[:-1]]) - assert result == create_result("Anomalies", expected_messages, stats=stats) + assert_results_equal( + cov.anomalies(jobs_stats[-1][0], [key for key, *_ in jobs_stats[:-1]]), + create_result("Anomalies", expected_messages, stats=stats), + ) diff --git a/tests/rules/test_duplicates.py b/tests/rules/test_duplicates.py index 90f6a8b..9dd2e7f 100755 --- a/tests/rules/test_duplicates.py +++ b/tests/rules/test_duplicates.py @@ -1,6 +1,6 @@ import arche.rules.duplicates as duplicates from arche.rules.result import Level, Outcome -from conftest import create_result +from conftest import * import numpy as np import pandas as pd import pytest @@ -45,8 +45,11 @@ @pytest.mark.parametrize("data, tagged_fields, expected_messages", unique_inputs) def test_find_by_unique(data, tagged_fields, expected_messages): df = pd.DataFrame(data) - assert duplicates.find_by_unique(df, tagged_fields) == create_result( - "Duplicates By **unique** Tag", expected_messages, items_count=len(df) + assert_results_equal( + 
duplicates.find_by_unique(df, tagged_fields), + create_result( + "Duplicates By **unique** Tag", expected_messages, items_count=len(df) + ), ) @@ -80,8 +83,9 @@ def test_find_by_unique(data, tagged_fields, expected_messages): ) def test_find_by(data, columns, expected_messages): df = pd.DataFrame(data) - assert duplicates.find_by(df, columns) == create_result( - "Duplicates", expected_messages, items_count=len(df) + assert_results_equal( + duplicates.find_by(df, columns), + create_result("Duplicates", expected_messages, items_count=len(df)), ) @@ -111,9 +115,11 @@ def test_find_by(data, columns, expected_messages): ) def test_find_by_name_url(data, tagged_fields, expected_messages): df = pd.DataFrame(data) - result = duplicates.find_by_name_url(df, tagged_fields) - assert result == create_result( - "Duplicates By **name_field, product_url_field** Tags", - expected_messages, - items_count=len(df), + assert_results_equal( + duplicates.find_by_name_url(df, tagged_fields), + create_result( + "Duplicates By **name_field, product_url_field** Tags", + expected_messages, + items_count=len(df), + ), ) diff --git a/tests/rules/test_json_schema.py b/tests/rules/test_json_schema.py index da0bdd2..80cd0d6 100755 --- a/tests/rules/test_json_schema.py +++ b/tests/rules/test_json_schema.py @@ -1,6 +1,6 @@ from arche.rules.json_schema import check_tags, validate from arche.rules.result import Level -from conftest import create_result +from conftest import * import pytest @@ -114,8 +114,10 @@ "source_columns, target_columns, tags, expected_messages", tags_inputs ) def test_check_tags(source_columns, target_columns, tags, expected_messages): - result = check_tags(source_columns, target_columns, tags) - assert result == create_result("Tags", expected_messages) + assert_results_equal( + check_tags(source_columns, target_columns, tags), + create_result("Tags", expected_messages), + ) @pytest.mark.parametrize( @@ -147,10 +149,14 @@ def test_check_tags(source_columns, target_columns, tags, expected_messages): ], ) def test_validate(get_raw_items, schema, expected_messages): - result = validate(schema, get_raw_items, range(len(get_raw_items))) - assert result == create_result("JSON Schema Validation", expected_messages) + assert_results_equal( + validate(schema, get_raw_items, range(len(get_raw_items))), + create_result("JSON Schema Validation", expected_messages), + ) def test_validate_passed(get_schema, get_raw_items): - result = validate(get_schema, get_raw_items, range(len(get_raw_items))) - assert result == create_result("JSON Schema Validation", {}) + assert_results_equal( + validate(get_schema, get_raw_items, range(len(get_raw_items))), + create_result("JSON Schema Validation", {}), + ) diff --git a/tests/rules/test_metadata.py b/tests/rules/test_metadata.py index 0f58303..5e8afd7 100755 --- a/tests/rules/test_metadata.py +++ b/tests/rules/test_metadata.py @@ -6,7 +6,7 @@ compare_response_ratio, ) from arche.rules.result import Level -from conftest import create_result, Job +from conftest import * import pytest @@ -32,7 +32,9 @@ def test_check_errors(get_job, error_count, expected_messages): job.metadata = {"scrapystats": error_count} job.key = "112358/13/21" - assert check_errors(job) == create_result("Job Errors", expected_messages) + assert_results_equal( + check_errors(job), create_result("Job Errors", expected_messages) + ) outcome_input = [ @@ -70,8 +72,9 @@ def test_check_outcome(get_job, metadata, expected_messages): job = get_job job.metadata = metadata - result = check_outcome(job) - assert result 
== create_result("Job Outcome", expected_messages) + assert_results_equal( + check_outcome(job), create_result("Job Outcome", expected_messages) + ) time_inputs = [ @@ -124,8 +127,10 @@ def test_compare_finish_time( source_job.metadata = source_metadata target_job.metadata = target_metadata - result = compare_finish_time(source_job, target_job) - assert result == create_result("Finish Time", expected_messages) + assert_results_equal( + compare_finish_time(source_job, target_job), + create_result("Finish Time", expected_messages), + ) compare_response_ratio_inputs = [ @@ -163,7 +168,7 @@ def test_compare_response_ratio( source_job = Job(stats=source_stats, metadata=source_metadata) target_job = Job(stats=target_stats, metadata=target_metadata) - result = compare_response_ratio(source_job, target_job) - assert result == create_result( - "Compare Responses Per Item Ratio", expected_messages + assert_results_equal( + compare_response_ratio(source_job, target_job), + create_result("Compare Responses Per Item Ratio", expected_messages), ) diff --git a/tests/rules/test_others.py b/tests/rules/test_others.py index 491516b..b9f9b55 100755 --- a/tests/rules/test_others.py +++ b/tests/rules/test_others.py @@ -2,7 +2,7 @@ from arche.rules.others import compare_boolean_fields, garbage_symbols from arche.rules.result import Level, Outcome -from conftest import create_named_df, create_result +from conftest import * import pandas as pd import pytest @@ -64,8 +64,8 @@ def test_compare_boolean_fields( source_df = pd.DataFrame(source_data) target_df = pd.DataFrame(target_data) rule_result = compare_boolean_fields(source_df, target_df) - assert rule_result == create_result( - "Boolean Fields", expected_messages, expected_stats + assert_results_equal( + rule_result, create_result("Boolean Fields", expected_messages, expected_stats) ) @@ -112,6 +112,9 @@ def test_compare_boolean_fields( "raw_items, expected_messages, expected_items_count", dirty_inputs ) def test_garbage_symbols(raw_items, expected_messages, expected_items_count): - assert garbage_symbols(pd.DataFrame(raw_items)) == create_result( - "Garbage Symbols", expected_messages, items_count=expected_items_count + assert_results_equal( + garbage_symbols(pd.DataFrame(raw_items)), + create_result( + "Garbage Symbols", expected_messages, items_count=expected_items_count + ), ) diff --git a/tests/rules/test_price.py b/tests/rules/test_price.py index b6dca99..0f6fbc3 100755 --- a/tests/rules/test_price.py +++ b/tests/rules/test_price.py @@ -1,6 +1,6 @@ import arche.rules.price as p from arche.rules.result import Level, Outcome -from conftest import create_result +from conftest import * import numpy as np import pandas as pd import pytest @@ -51,9 +51,11 @@ @pytest.mark.parametrize("data, tagged_fields, expected_messages", was_now_inputs) def test_compare_was_now(data, tagged_fields, expected_messages): df = pd.DataFrame(data) - result = p.compare_was_now(df, tagged_fields) - assert result == create_result( - "Compare Price Was And Now", expected_messages, items_count=len(df) + assert_results_equal( + p.compare_was_now(df, tagged_fields), + create_result( + "Compare Price Was And Now", expected_messages, items_count=len(df) + ), ) @@ -63,7 +65,6 @@ def test_compare_was_now(data, tagged_fields, expected_messages): {"price": [1.15, "2.3", 6], "url": ["http://1", "http://2", np.nan]}, {"product_price_field": ["price"], "product_url_field": ["url"]}, { - Level.INFO: [("2 same urls in both jobs",)], Level.ERROR: [ ( "2 checked, 2 errors", @@ -74,7 +75,7 @@ def 
test_compare_was_now(data, tagged_fields, expected_messages): "target price is 1.15 for 0" ), ) - ], + ] }, ) ] @@ -89,7 +90,9 @@ def test_compare_prices_for_same_urls( result = p.compare_prices_for_same_urls( pd.DataFrame(source_data), pd.DataFrame(target_data), tagged_fields ) - assert result == create_result("Compare Prices For Same Urls", expected_messages) + assert_results_equal( + result, create_result("Compare Prices For Same Urls", expected_messages) + ) compare_names_inputs = [ @@ -123,7 +126,9 @@ def test_compare_names_for_same_urls( result = p.compare_names_for_same_urls( pd.DataFrame(source_data), pd.DataFrame(target_data), tagged_fields ) - assert result == create_result("Compare Names Per Url", expected_messages) + assert_results_equal( + result, create_result("Compare Names Per Url", expected_messages) + ) @pytest.mark.parametrize( @@ -134,15 +139,6 @@ def test_compare_names_for_same_urls( {"name": ["Coffee", "Tea", "Wine"], "price": [4.0, 4.8, 20.0]}, {"name_field": ["name"], "product_price_field": ["price"]}, { - Level.INFO: [ - ( - "1 names missing from the tested job", - None, - {"Missing Wine": {2}}, - ), - ("1 new names in the tested job",), - ("2 same names in both jobs",), - ], Level.ERROR: [ ( "2 checked, 1 errors", @@ -151,7 +147,7 @@ def test_compare_names_for_same_urls( "target price is 4.0 for 0" ), ) - ], + ] }, ) ], @@ -162,4 +158,6 @@ def test_compare_prices_for_same_names( result = p.compare_prices_for_same_names( pd.DataFrame(source_data), pd.DataFrame(target_data), tagged_fields ) - assert result == create_result("Compare Prices For Same Names", expected_messages) + assert_results_equal( + result, create_result("Compare Prices For Same Names", expected_messages) + ) diff --git a/tests/rules/test_result.py b/tests/rules/test_result.py index 23ca2b6..c9ebb0b 100755 --- a/tests/rules/test_result.py +++ b/tests/rules/test_result.py @@ -57,34 +57,6 @@ def test_result_err_keys(messages, true_err_keys): assert Result("x", messages=messages).err_keys == true_err_keys -@pytest.mark.parametrize( - "source, target", - [ - ( - pd.Series([0, 1], index=["f", "l"], name="n"), - pd.Series([0, 1], index=["f", "l"], name="n"), - ), - (pd.DataFrame([0, 1]), pd.DataFrame([0, 1])), - ], -) -def test_tensors_equal(source, target): - assert Result.tensors_equal(source, target) - - -@pytest.mark.parametrize( - "source, target", - [ - ( - pd.Series([0, 1], index=["f", "l"], name="s"), - pd.Series([0, 1], index=["f", "l"], name="n"), - ), - (pd.DataFrame([0, 1]), pd.DataFrame([0, 1], index=["m", "s"])), - ], -) -def test_tensors_not_equal(source, target): - assert not Result.tensors_equal(source, target) - - @pytest.mark.parametrize( "message, stats, outputs", [ @@ -111,59 +83,3 @@ def test_show(mocker, capsys, message, stats, outputs): res.show() mock_pio_show.assert_called_once_with(res.figures[0]) mocked_md.assert_has_calls(mocker.call(o) for o in outputs) - - -@pytest.mark.parametrize( - "left_params, right_params", - [ - ( - ( - "s", - {Level.INFO: ["sum", "det", {"err1": [0, 1]}]}, - [pd.Series([0], name="s"), pd.DataFrame({"s": [0]})], - 2, - ["err1"], - 1, - ), - ( - "s", - {Level.INFO: ["sum", "det", {"err1": [0, 1]}]}, - [pd.Series([0], name="s"), pd.DataFrame({"s": [0]})], - 2, - ["err1"], - 1, - ), - ), - (("s",), ("s",)), - ], -) -def test_result_equal(left_params, right_params): - assert Result(*left_params) == Result(*right_params) - - -@pytest.mark.parametrize( - "left_params, right_params", - [ - ( - ( - "s", - {Level.INFO: ["sum", "det", {"err1": [0, 1]}]}, - 
[pd.Series([0], name="A name"), pd.DataFrame([0])], - 2, - ["err1"], - 1, - ), - ( - "s", - {Level.INFO: ["sum", "det", {"err1": [0, 1]}]}, - [pd.Series([0], name="A series name"), pd.DataFrame([0])], - 2, - ["err1"], - 1, - ), - ), - (("s",), ("t",)), - ], -) -def test_result_not_equal(left_params, right_params): - assert Result(*left_params) != Result(*right_params) diff --git a/tests/test_conftest.py b/tests/test_conftest.py new file mode 100644 index 0000000..2b12c17 --- /dev/null +++ b/tests/test_conftest.py @@ -0,0 +1,30 @@ +from conftest import * + + +@pytest.mark.parametrize( + "source, target", + [ + ( + pd.Series([0, 1], index=["f", "l"], name="n"), + pd.Series([0, 1], index=["f", "l"], name="n"), + ), + (pd.DataFrame([0, 1]), pd.DataFrame([0, 1])), + ], +) +def test_assert_tensors_equal(source, target): + assert_tensors_equal(source, target) + + +@pytest.mark.parametrize( + "source, target", + [ + ( + pd.Series([0, 1], index=["f", "l"], name="s"), + pd.Series([0, 1], index=["f", "l"], name="n"), + ), + (pd.DataFrame([0, 1]), pd.DataFrame([0, 1], index=["m", "s"])), + ], +) +def test_assert_tensors_not_equal(source, target): + with pytest.raises(AssertionError): + assert_tensors_equal(source, target) diff --git a/tox.ini b/tox.ini index 256cfff..e7d5779 100755 --- a/tox.ini +++ b/tox.ini @@ -29,7 +29,7 @@ commands = mypy --ignore-missing-imports src/arche tests [flake8] select = C,E,F,W,I,D,B,B9 -ignore = W503, E741, E501, E203, I101 +ignore = W503, E741, E501, E203, I101, F403, F405 exclude = .tox, .git,