From e336d336b9d971ab2f4eaff8452cfbe23cfa4bb5 Mon Sep 17 00:00:00 2001
From: manycoding <manycoding@users.noreply.github.com>
Date: Thu, 12 Sep 2019 12:21:51 -0300
Subject: [PATCH 01/10] Add compare_fields

---
 docs/source/nbs/Rules.ipynb | 26 +++++++++++++++++++
 src/arche/rules/compare.py  | 47 ++++++++++++++++++++++++++++++++++
 tests/rules/test_compare.py | 51 +++++++++++++++++++++++++++++++++++++
 3 files changed, 124 insertions(+)
 create mode 100644 src/arche/rules/compare.py
 create mode 100644 tests/rules/test_compare.py

diff --git a/docs/source/nbs/Rules.ipynb b/docs/source/nbs/Rules.ipynb
index 5089f1a..35520ac 100644
--- a/docs/source/nbs/Rules.ipynb
+++ b/docs/source/nbs/Rules.ipynb
@@ -200,6 +200,32 @@
     "arche.rules.category.get_difference(df, target_df, [\"category\"]).show()"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Compare\n",
+    "### Fields"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "help(arche.rules.compare.fields)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "arche.rules.compare.fields(df, target_df, [\"part_number\", \"name\", \"uom\"]).show()"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
diff --git a/src/arche/rules/compare.py b/src/arche/rules/compare.py
new file mode 100644
index 0000000..eeee202
--- /dev/null
+++ b/src/arche/rules/compare.py
@@ -0,0 +1,47 @@
+from typing import List
+
+from arche.rules.result import Result
+import pandas as pd
+
+
+def fields(
+    source_df: pd.DataFrame,
+    target_df: pd.DataFrame,
+    fields: List[str],
+    err_thr: float = 0.25,
+) -> Result:
+    """Return field values difference between jobs"""
+
+    result = Result("Fields Difference")
+
+    for field in fields:
+        source = source_df[field].dropna()
+        target = target_df[field].dropna()
+        same = source[source.isin(target)]
+        new = source[~(source.isin(target))]
+        result.add_info(
+            f"{len(source)} `non NaN {field}s` - {len(new)} new, {len(same)} same"
+        )
+        missing = target[~(target.isin(source))]
+        missing_values = missing.values
+        if len(missing_values) == 0:
+            continue
+
+        if len(missing) < 6:
+            msg = ", ".join(missing_values.astype(str))
+        else:
+            missing_values = missing[:5].values
+            msg = f"{', '.join(missing_values.astype(str))}..."
+        msg = f"{msg} `{field}s` are missing"
+        if len(missing) / len(target_df) >= err_thr:
+            result.add_error(
+                f"{len(missing)} `{field}s` are missing",
+                errors={msg: set(missing.index)},
+            )
+        else:
+            result.add_info(
+                f"{len(missing)} `{field}s` are missing",
+                errors={msg: set(missing.index)},
+            )
+
+    return result
diff --git a/tests/rules/test_compare.py b/tests/rules/test_compare.py
new file mode 100644
index 0000000..685dc08
--- /dev/null
+++ b/tests/rules/test_compare.py
@@ -0,0 +1,51 @@
+import arche.rules.compare as compare
+from arche.rules.result import Level
+from conftest import *
+import pytest
+
+
+@pytest.mark.parametrize(
+    ["source", "target", "fields", "expected"],
+    [
+        (
+            {
+                "one": list(range(50)) + [42] * 50,
+                "two": list(range(100)),
+                "three": [np.nan] * 50 + list(range(50)),
+            },
+            {
+                "one": list(range(50, 100)) + [42] * 500,
+                "two": list(range(550)),
+                "three": [np.nan] * 500 + list(range(50)),
+            },
+            ["one", "two", "three"],
+            {
+                Level.INFO: [
+                    ("100 `non NaN ones` - 49 new, 51 same",),
+                    (
+                        "50 `ones` are missing",
+                        None,
+                        {"50, 51, 52, 53, 54... `ones` are missing": set(range(50))},
+                    ),
+                    ("100 `non NaN twos` - 0 new, 100 same",),
+                    ("50 `non NaN threes` - 0 new, 50 same",),
+                ],
+                Level.ERROR: [
+                    (
+                        "450 `twos` are missing",
+                        None,
+                        {
+                            "100, 101, 102, 103, 104... `twos` are missing": set(
+                                range(100, 550)
+                            )
+                        },
+                    )
+                ],
+            },
+        )
+    ],
+)
+def test_fields(source, target, fields, expected):
+    assert compare.fields(
+        pd.DataFrame(source), pd.DataFrame(target), fields
+    ) == create_result("Fields Difference", expected)

From e89ac5a71fd2a86f1dc06600ad48fe2d4f7d13a9 Mon Sep 17 00:00:00 2001
From: manycoding <manycoding@users.noreply.github.com>
Date: Thu, 12 Sep 2019 12:21:58 -0300
Subject: [PATCH 02/10] Ignore star-import flake8 warnings (F403, F405)

---
 tox.ini | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tox.ini b/tox.ini
index 077c79c..46f6fc8 100755
--- a/tox.ini
+++ b/tox.ini
@@ -24,7 +24,7 @@ commands =
 
 [flake8]
 select = C,E,F,W,I,D,B,B9
-ignore = W503, E741, E501, E203, I101
+ignore = W503, E741, E501, E203, I101, F403, F405
 exclude =
     .tox,
     .git,

From 46a6ab7704a6bfe352446fd04904213ea855bd13 Mon Sep 17 00:00:00 2001
From: manycoding <manycoding@users.noreply.github.com>
Date: Thu, 12 Sep 2019 14:59:58 -0300
Subject: [PATCH 03/10] Refactor price: move url/name diff to compare.tagged_fields

---
 src/arche/arche.py          |  9 ++++++
 src/arche/readers/schema.py |  2 +-
 src/arche/rules/compare.py  | 38 +++++++++++++++++++------
 src/arche/rules/price.py    | 55 ++++---------------------------------
 tests/rules/test_price.py   | 14 ++--------
 5 files changed, 47 insertions(+), 71 deletions(-)

diff --git a/src/arche/arche.py b/src/arche/arche.py
index 3e67ee5..fa0c2f0 100755
--- a/src/arche/arche.py
+++ b/src/arche/arche.py
@@ -7,6 +7,7 @@
 from arche.readers.schema import Schema, SchemaSource
 from arche.report import Report
 import arche.rules.category as category_rules
+import arche.rules.compare as compare
 import arche.rules.coverage as coverage_rules
 import arche.rules.duplicates as duplicate_rules
 import arche.rules.json_schema as schema_rules
@@ -256,3 +257,11 @@ def compare_with_customized_rules(self, source_items, target_items, tagged_field
             price_rules.compare_prices_for_same_names,
         ]:
             self.save_result(r(source_items.df, target_items.df, tagged_fields))
+        self.save_result(
+            compare.tagged_fields(
+                source_items.df,
+                target_items.df,
+                tagged_fields,
+                ["product_url_field", "name_field"],
+            )
+        )
diff --git a/src/arche/readers/schema.py b/src/arche/readers/schema.py
index 06b8883..5a1d369 100755
--- a/src/arche/readers/schema.py
+++ b/src/arche/readers/schema.py
@@ -69,7 +69,7 @@ def get_tags(schema: RawSchema) -> TaggedFields:
             property_tags = value.get("tag", [])
             if property_tags:
                 tagged_fields = Schema.get_field_tags(property_tags, key, tagged_fields)
-        return tagged_fields
+        return dict(tagged_fields)
 
     @classmethod
     def get_field_tags(
diff --git a/src/arche/rules/compare.py b/src/arche/rules/compare.py
index eeee202..ab2fa81 100644
--- a/src/arche/rules/compare.py
+++ b/src/arche/rules/compare.py
@@ -1,20 +1,21 @@
 from typing import List
 
-from arche.rules.result import Result
+from arche.readers.schema import TaggedFields
+from arche.rules.result import *
 import pandas as pd
 
 
 def fields(
     source_df: pd.DataFrame,
     target_df: pd.DataFrame,
-    fields: List[str],
+    names: List[str],
     err_thr: float = 0.25,
 ) -> Result:
-    """Return field values difference between jobs"""
+    """Return fields values difference between dataframes"""
 
     result = Result("Fields Difference")
 
-    for field in fields:
+    for field in names:
         source = source_df[field].dropna()
         target = target_df[field].dropna()
         same = source[source.isin(target)]
@@ -23,15 +24,13 @@ def fields(
             f"{len(source)} `non NaN {field}s` - {len(new)} new, {len(same)} same"
         )
         missing = target[~(target.isin(source))]
-        missing_values = missing.values
-        if len(missing_values) == 0:
+        if len(missing) == 0:
             continue
 
         if len(missing) < 6:
-            msg = ", ".join(missing_values.astype(str))
+            msg = ", ".join(missing.unique().astype(str))
         else:
-            missing_values = missing[:5].values
-            msg = f"{', '.join(missing_values.astype(str))}..."
+            msg = f"{', '.join(missing.unique()[:5].astype(str))}..."
         msg = f"{msg} `{field}s` are missing"
         if len(missing) / len(target_df) >= err_thr:
             result.add_error(
@@ -45,3 +44,24 @@ def fields(
             )
 
     return result
+
+
+def tagged_fields(
+    source_df: pd.DataFrame,
+    target_df: pd.DataFrame,
+    tagged_fields: TaggedFields,
+    tags: List[str],
+) -> Result:
+    """Compare fields tagged with `tags` between two dataframes."""
+    name = f"{', '.join(tags)} Fields Difference"
+    result = Result(name)
+    fields_names = list()
+    for tag in tags:
+        if tagged_fields.get(tag):
+            fields_names.extend(tagged_fields.get(tag))
+    if not fields_names:
+        result.add_info(Outcome.SKIPPED)
+        return result
+    result = fields(source_df, target_df, fields_names)
+    result.name = name
+    return result
diff --git a/src/arche/rules/price.py b/src/arche/rules/price.py
index 868ee7a..7216e42 100755
--- a/src/arche/rules/price.py
+++ b/src/arche/rules/price.py
@@ -65,14 +65,13 @@ def compare_was_now(df: pd.DataFrame, tagged_fields: TaggedFields):
 
 def compare_prices_for_same_urls(
     source_df: pd.DataFrame, target_df: pd.DataFrame, tagged_fields: TaggedFields
-):
+) -> Result:
     """For each pair of items that have the same `product_url_field` tagged field,
     compare `product_price_field` field
 
     Returns:
-        A result containing pairs of items with same `product_url_field`
-        from `source_df` and `target_df` which `product_price_field` differ,
-        missing and new `product_url_field` tagged fields.
+        A result containing pairs of items from `source_df` and `target_df`
+        which `product_price_field` differ.
     """
     result = Result("Compare Prices For Same Urls")
     url_field = tagged_fields.get("product_url_field")
@@ -88,26 +87,7 @@ def compare_prices_for_same_urls(
     same_urls = source_df[(source_df[url_field].isin(target_df[url_field].values))][
         url_field
     ]
-    new_urls = source_df[~(source_df[url_field].isin(target_df[url_field].values))][
-        url_field
-    ]
-    missing_urls = target_df[(~target_df[url_field].isin(source_df[url_field].values))][
-        url_field
-    ]
-
-    errors = {}
-    for url, group in missing_urls.groupby(missing_urls):
-        errors[f"Missing {url}"] = set(group.index)
-
-    if not missing_urls.empty:
-        result.add_info(
-            f"{len(missing_urls)} urls missing from the tested job", errors=errors
-        )
-    if not new_urls.empty:
-        result.add_info(f"{len(new_urls)} new urls in the tested job")
-    result.add_info(f"{len(same_urls)} same urls in both jobs")
 
-    diff_prices_count = 0
     price_field = tagged_fields.get("product_price_field")
     if not price_field:
         result.add_info("product_price_field tag is not set")
@@ -128,7 +108,6 @@ def compare_prices_for_same_urls(
                     and is_number(target_price)
                     and ratio_diff(source_price, target_price) > 0.1
                 ):
-                    diff_prices_count += 1
                     source_key = source_df[source_df[url_field] == url].index[0]
                     target_key = target_df[target_df[url_field] == url].index[0]
                     msg = (
@@ -137,7 +116,7 @@ def compare_prices_for_same_urls(
                     )
                     detailed_messages.append(msg)
 
-        res = f"{len(same_urls)} checked, {diff_prices_count} errors"
+        res = f"{len(same_urls)} checked, {len(detailed_messages)} errors"
         if detailed_messages:
             result.add_error(res, detailed="\n".join(detailed_messages))
         else:
@@ -212,33 +191,12 @@ def compare_prices_for_same_names(
     same_names = source_df[(source_df[name_field].isin(target_df[name_field].values))][
         name_field
     ]
-    new_names = source_df[~(source_df[name_field].isin(target_df[name_field].values))][
-        name_field
-    ]
-    missing_names = target_df[
-        ~(target_df[name_field].isin(source_df[name_field].values))
-    ][name_field]
 
-    errors = {}
-    for name, group in missing_names.groupby(missing_names):
-        errors[f"Missing {name}"] = set(group.index)
-
-    if not missing_names.empty:
-        result.add_info(
-            f"{len(missing_names)} names missing from the tested job", errors=errors
-        )
-    if not new_names.empty:
-        result.add_info(f"{len(new_names)} new names in the tested job")
-    result.add_info(f"{len(same_names)} same names in both jobs")
-
-    price_tag = "product_price_field"
-    price_field = tagged_fields.get(price_tag)
+    price_field = tagged_fields.get("product_price_field")
     if not price_field:
         result.add_info("product_price_field tag is not set")
         return result
-
     price_field = price_field[0]
-    count = 0
 
     detailed_messages = []
     for name in same_names:
@@ -247,7 +205,6 @@ def compare_prices_for_same_names(
             target_price = target_df[target_df[name_field] == name][price_field].iloc[0]
             if is_number(source_price) and is_number(target_price):
                 if ratio_diff(source_price, target_price) > 0.1:
-                    count += 1
                     source_key = source_df[source_df[name_field] == name].index[0]
                     target_key = target_df[target_df[name_field] == name].index[0]
                     msg = (
@@ -256,7 +213,7 @@ def compare_prices_for_same_names(
                     )
                     detailed_messages.append(msg)
 
-    result_msg = f"{len(same_names)} checked, {count} errors"
+    result_msg = f"{len(same_names)} checked, {len(detailed_messages)} errors"
     if detailed_messages:
         result.add_error(result_msg, detailed="\n".join(detailed_messages))
     else:
diff --git a/tests/rules/test_price.py b/tests/rules/test_price.py
index b6dca99..237a13c 100755
--- a/tests/rules/test_price.py
+++ b/tests/rules/test_price.py
@@ -63,7 +63,6 @@ def test_compare_was_now(data, tagged_fields, expected_messages):
         {"price": [1.15, "2.3", 6], "url": ["http://1", "http://2", np.nan]},
         {"product_price_field": ["price"], "product_url_field": ["url"]},
         {
-            Level.INFO: [("2 same urls in both jobs",)],
             Level.ERROR: [
                 (
                     "2 checked, 2 errors",
@@ -74,7 +73,7 @@ def test_compare_was_now(data, tagged_fields, expected_messages):
                         "target price is 1.15 for 0"
                     ),
                 )
-            ],
+            ]
         },
     )
 ]
@@ -134,15 +133,6 @@ def test_compare_names_for_same_urls(
             {"name": ["Coffee", "Tea", "Wine"], "price": [4.0, 4.8, 20.0]},
             {"name_field": ["name"], "product_price_field": ["price"]},
             {
-                Level.INFO: [
-                    (
-                        "1 names missing from the tested job",
-                        None,
-                        {"Missing Wine": {2}},
-                    ),
-                    ("1 new names in the tested job",),
-                    ("2 same names in both jobs",),
-                ],
                 Level.ERROR: [
                     (
                         "2 checked, 1 errors",
@@ -151,7 +141,7 @@ def test_compare_names_for_same_urls(
                             "target price is 4.0 for 0"
                         ),
                     )
-                ],
+                ]
             },
         )
     ],

From 6f2b100e127e2d1b7296ffce74e91280add62efe Mon Sep 17 00:00:00 2001
From: manycoding <manycoding@users.noreply.github.com>
Date: Fri, 13 Sep 2019 17:25:16 -0300
Subject: [PATCH 04/10] Support nested structures

---
 src/arche/rules/compare.py  | 15 +++++++++++----
 tests/rules/test_compare.py | 33 ++++++++++++++++++++++++++++++---
 2 files changed, 41 insertions(+), 7 deletions(-)

diff --git a/src/arche/rules/compare.py b/src/arche/rules/compare.py
index ab2fa81..9af27d6 100644
--- a/src/arche/rules/compare.py
+++ b/src/arche/rules/compare.py
@@ -14,16 +14,23 @@ def fields(
     """Return fields values difference between dataframes"""
 
     result = Result("Fields Difference")
-
     for field in names:
         source = source_df[field].dropna()
         target = target_df[field].dropna()
-        same = source[source.isin(target)]
-        new = source[~(source.isin(target))]
+        try:
+            same = source[source.isin(target)]
+            new = source[~(source.isin(target))]
+            missing = target[~(target.isin(source))]
+        except SystemError:
+            source = source.apply(str)
+            target = target.apply(str)
+            same = source[source.isin(target)]
+            new = source[~(source.isin(target))]
+            missing = target[~(target.isin(source))]
+
         result.add_info(
             f"{len(source)} `non NaN {field}s` - {len(new)} new, {len(same)} same"
         )
-        missing = target[~(target.isin(source))]
         if len(missing) == 0:
             continue
 
diff --git a/tests/rules/test_compare.py b/tests/rules/test_compare.py
index 685dc08..a1c8c7e 100644
--- a/tests/rules/test_compare.py
+++ b/tests/rules/test_compare.py
@@ -9,7 +9,7 @@
     [
         (
             {
-                "one": list(range(50)) + [42] * 50,
+                "one": list(range(50)) + ["42"] * 50,
                 "two": list(range(100)),
                 "three": [np.nan] * 50 + list(range(50)),
             },
@@ -21,7 +21,7 @@
             ["one", "two", "three"],
             {
                 Level.INFO: [
-                    ("100 `non NaN ones` - 49 new, 51 same",),
+                    ("100 `non NaN ones` - 99 new, 1 same",),
                     (
                         "50 `ones` are missing",
                         None,
@@ -42,7 +42,34 @@
                     )
                 ],
             },
-        )
+        ),
+        (
+            {
+                "four": [{i} for i in range(10)]
+                + [{"k": {"k": i}} for i in range(10)]
+                + ["l"] * 80
+            },
+            {
+                "four": [{i} for i in range(20)]
+                + [{"k": {"k": i}} for i in range(10)]
+                + ["l"] * 520
+            },
+            ["four"],
+            {
+                Level.INFO: [
+                    ("100 `non NaN fours` - 0 new, 100 same",),
+                    (
+                        "10 `fours` are missing",
+                        None,
+                        {
+                            "{10}, {11}, {12}, {13}, {14}... `fours` are missing": set(
+                                range(10, 20)
+                            )
+                        },
+                    ),
+                ]
+            },
+        ),
     ],
 )
 def test_fields(source, target, fields, expected):

From 4df498ac33c09e0c945fe291fa7c1ef1c9272bb4 Mon Sep 17 00:00:00 2001
From: manycoding <manycoding@users.noreply.github.com>
Date: Wed, 18 Sep 2019 11:46:45 -0300
Subject: [PATCH 05/10] Add MAX_MISSING_VALUES constant, use astype(str)

---
 src/arche/rules/compare.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/arche/rules/compare.py b/src/arche/rules/compare.py
index 9af27d6..4e5e608 100644
--- a/src/arche/rules/compare.py
+++ b/src/arche/rules/compare.py
@@ -5,6 +5,9 @@
 import pandas as pd
 
 
+MAX_MISSING_VALUES = 6
+
+
 def fields(
     source_df: pd.DataFrame,
     target_df: pd.DataFrame,
@@ -22,8 +25,8 @@ def fields(
             new = source[~(source.isin(target))]
             missing = target[~(target.isin(source))]
         except SystemError:
-            source = source.apply(str)
-            target = target.apply(str)
+            source = source.astype(str)
+            target = target.astype(str)
             same = source[source.isin(target)]
             new = source[~(source.isin(target))]
             missing = target[~(target.isin(source))]
@@ -34,7 +37,7 @@ def fields(
         if len(missing) == 0:
             continue
 
-        if len(missing) < 6:
+        if len(missing) < MAX_MISSING_VALUES:
             msg = ", ".join(missing.unique().astype(str))
         else:
             msg = f"{', '.join(missing.unique()[:5].astype(str))}..."

From 10897ceebbf50b10f35474bd98bad0ea029f2d68 Mon Sep 17 00:00:00 2001
From: manycoding <manycoding@users.noreply.github.com>
Date: Thu, 19 Sep 2019 15:03:02 -0300
Subject: [PATCH 06/10] Add normalization

---
 src/arche/__init__.py       |  1 +
 src/arche/rules/compare.py  | 34 ++++++++++++++++++++++++----------
 tests/rules/test_compare.py | 12 +++++++-----
 3 files changed, 32 insertions(+), 15 deletions(-)

diff --git a/src/arche/__init__.py b/src/arche/__init__.py
index cf3d18e..109f08f 100755
--- a/src/arche/__init__.py
+++ b/src/arche/__init__.py
@@ -1,4 +1,5 @@
 import logging
+from typing import *  # noqa
 
 __version__ = "0.3.6"
 SH_URL = "https://app.scrapinghub.com/p"  # noqa
diff --git a/src/arche/rules/compare.py b/src/arche/rules/compare.py
index 4e5e608..dd7111e 100644
--- a/src/arche/rules/compare.py
+++ b/src/arche/rules/compare.py
@@ -1,8 +1,5 @@
-from typing import List
-
 from arche.readers.schema import TaggedFields
 from arche.rules.result import *
-import pandas as pd
 
 
 MAX_MISSING_VALUES = 6
@@ -12,24 +9,41 @@ def fields(
     source_df: pd.DataFrame,
     target_df: pd.DataFrame,
     names: List[str],
+    normalize: bool = False,
     err_thr: float = 0.25,
 ) -> Result:
-    """Return fields values difference between dataframes"""
+    """Return fields values difference between dataframe.
+
+    Args:
+        names - a list of field names
+        normalize - if set, all fields converted to str and processed with lower() and strip()
+
+    Returns:
+        Result with same, missing and new values.
+    """
+
+    def get_difference(
+        left: pd.Series, right: pd.Series
+    ) -> (pd.Series, pd.Series, pd.Series):
+        return (
+            left[left.isin(right)],
+            left[~(left.isin(right))],
+            right[~(right.isin(left))],
+        )
 
     result = Result("Fields Difference")
     for field in names:
         source = source_df[field].dropna()
         target = target_df[field].dropna()
+        if normalize:
+            source = source.astype(str).str.lower().str.strip()
+            target = target.astype(str).str.lower().str.strip()
         try:
-            same = source[source.isin(target)]
-            new = source[~(source.isin(target))]
-            missing = target[~(target.isin(source))]
+            same, new, missing = get_difference(source, target)
         except SystemError:
             source = source.astype(str)
             target = target.astype(str)
-            same = source[source.isin(target)]
-            new = source[~(source.isin(target))]
-            missing = target[~(target.isin(source))]
+            same, new, missing = get_difference(source, target)
 
         result.add_info(
             f"{len(source)} `non NaN {field}s` - {len(new)} new, {len(same)} same"
diff --git a/tests/rules/test_compare.py b/tests/rules/test_compare.py
index a1c8c7e..42802c2 100644
--- a/tests/rules/test_compare.py
+++ b/tests/rules/test_compare.py
@@ -5,7 +5,7 @@
 
 
 @pytest.mark.parametrize(
-    ["source", "target", "fields", "expected"],
+    ["source", "target", "fields", "normalize", "expected"],
     [
         (
             {
@@ -19,6 +19,7 @@
                 "three": [np.nan] * 500 + list(range(50)),
             },
             ["one", "two", "three"],
+            False,
             {
                 Level.INFO: [
                     ("100 `non NaN ones` - 99 new, 1 same",),
@@ -46,15 +47,16 @@
         (
             {
                 "four": [{i} for i in range(10)]
-                + [{"k": {"k": i}} for i in range(10)]
+                + [{"K": {"k": i}} for i in range(10)]
                 + ["l"] * 80
             },
             {
                 "four": [{i} for i in range(20)]
                 + [{"k": {"k": i}} for i in range(10)]
-                + ["l"] * 520
+                + ["L"] * 520
             },
             ["four"],
+            True,
             {
                 Level.INFO: [
                     ("100 `non NaN fours` - 0 new, 100 same",),
@@ -72,7 +74,7 @@
         ),
     ],
 )
-def test_fields(source, target, fields, expected):
+def test_fields(source, target, fields, normalize, expected):
     assert compare.fields(
-        pd.DataFrame(source), pd.DataFrame(target), fields
+        pd.DataFrame(source), pd.DataFrame(target), fields, normalize
     ) == create_result("Fields Difference", expected)

From da3b312df898f3da964197374eaaf5c3c2e8d264 Mon Sep 17 00:00:00 2001
From: manycoding <manycoding@users.noreply.github.com>
Date: Thu, 26 Sep 2019 15:47:26 -0300
Subject: [PATCH 07/10] Add more_stats to easily access all data, replace
 Result class eq with assert

---
 src/arche/rules/compare.py  |   5 +-
 src/arche/rules/result.py   |  33 ++----------
 tests/conftest.py           |  63 ++++++++++++++--------
 tests/rules/test_compare.py | 102 +++++++++++++++++++++++-------------
 tests/rules/test_result.py  |  84 -----------------------------
 tests/test_conftest.py      |  30 +++++++++++
 6 files changed, 147 insertions(+), 170 deletions(-)
 create mode 100644 tests/test_conftest.py

diff --git a/src/arche/rules/compare.py b/src/arche/rules/compare.py
index dd7111e..eb4a237 100644
--- a/src/arche/rules/compare.py
+++ b/src/arche/rules/compare.py
@@ -45,6 +45,10 @@ def get_difference(
             target = target.astype(str)
             same, new, missing = get_difference(source, target)
 
+        same.name, new.name, missing.name = (None, None, None)
+        result.more_stats.update(
+            {f"{field}": {"same": same, "new": new, "missing": missing}}
+        )
         result.add_info(
             f"{len(source)} `non NaN {field}s` - {len(new)} new, {len(same)} same"
         )
@@ -66,7 +70,6 @@ def get_difference(
                 f"{len(missing)} `{field}s` are missing",
                 errors={msg: set(missing.index)},
             )
-
     return result
 
 
diff --git a/src/arche/rules/result.py b/src/arche/rules/result.py
index 172ee46..6eae28f 100755
--- a/src/arche/rules/result.py
+++ b/src/arche/rules/result.py
@@ -65,35 +65,12 @@ class Result:
 
     name: str
     messages: Dict[Level, List[Message]] = field(default_factory=dict)
-    _stats: Optional[List[Stat]] = field(default_factory=list)
-    items_count: Optional[int] = 0
+    _stats: List[Stat] = field(default_factory=list)
+    more_stats: Dict[str, Dict] = field(default_factory=dict)
+    items_count: int = 0
     _err_keys: Set[Union[str, int]] = field(default_factory=set)
-    _err_items_count: Optional[int] = 0
-    _figures: Optional[List[go.FigureWidget]] = field(default_factory=list)
-
-    def __eq__(self, other):
-        for left, right in zip(self.stats, other.stats):
-            if not self.tensors_equal(left, right):
-                return False
-
-        return (
-            self.name == other.name
-            and self.messages == other.messages
-            and self.items_count == other.items_count
-            and self.err_items_count == other.err_items_count
-            and len(self.stats) == len(other.stats)
-        )
-
-    @staticmethod
-    def tensors_equal(left: Stat, right: Stat):
-        try:
-            if isinstance(left, pd.DataFrame):
-                pd.testing.assert_frame_equal(left, right)
-            else:
-                pd.testing.assert_series_equal(left, right)
-            return True
-        except AssertionError:
-            return False
+    _err_items_count: int = 0
+    _figures: List[go.FigureWidget] = field(default_factory=list)
 
     @property
     def info(self):
diff --git a/tests/conftest.py b/tests/conftest.py
index 1c53809..747f032 100755
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,6 +1,6 @@
 from copy import deepcopy
-from itertools import zip_longest
-from typing import Dict, Iterable, List, Optional
+from functools import partial
+from typing import Any, Dict, Iterable, List, Optional
 
 from arche.readers.items import CollectionItems, JobItems
 from arche.rules.result import Level, Message, Result, Stat
@@ -209,6 +209,7 @@ def create_result(
     messages: Dict[Level, List[Message]],
     stats: Optional[List[Stat]] = None,
     items_count: Optional[int] = None,
+    more_stats: Optional[Dict[str, Any]] = None,
 ) -> Result:
     result = Result(rule_name)
     for level, messages in messages.items():
@@ -217,30 +218,50 @@ def create_result(
 
     if stats:
         result.stats = stats
+    if more_stats:
+        result.more_stats = more_stats
     if items_count:
         result.items_count = items_count
     return result
 
 
-def pytest_assertrepr_compare(op, left, right):
-    if isinstance(left, Result) and isinstance(right, Result) and op == "==":
-        assert_msgs = ["Results are equal"]
-        for (left_n, left_v), (_, right_v) in zip_longest(
-            left.__dict__.items(), right.__dict__.items()
-        ):
-            if left_n == "_stats":
-                for left_stat, right_stat in zip_longest(left_v, right_v):
-                    try:
-                        if isinstance(left_stat, pd.DataFrame):
-                            pd.testing.assert_frame_equal(left_stat, right_stat)
-                        else:
-                            pd.testing.assert_series_equal(left_stat, right_stat)
-                    except AssertionError as e:
-                        assert_msgs.extend([f"{left_stat}", "!=", f"{right_stat}"])
-                        assert_msgs.extend(str(e).split("\n"))
-            elif left_v != right_v:
-                assert_msgs.extend([f"{left_v}", "!=", f"{right_v}"])
-        return assert_msgs
+def assert_results_equal(left: Result, right: Result, **kwargs):
+    attrs = [
+        "name",
+        "messages",
+        "items_count",
+        "_err_items_count",
+        "_err_keys",
+        "_figures",
+    ]
+    for attr in attrs:
+        assert getattr(left, attr) == getattr(right, attr)
+    assert len(left.stats) == len(right.stats)
+
+    def assert_dicts_equal(left: Dict, right: Dict):
+        assert left.keys() == right.keys()
+        assert len(left.items()) == len(right.items())
+        for left_v, right_v in zip(left.values(), right.values()):
+            if isinstance(left_v, dict):
+                assert_dicts_equal(left_v, right_v)
+            elif isinstance(left_v, (pd.Series, pd.DataFrame)):
+                assert_tensors_equal(left_v, right_v, **kwargs)
+            else:
+                assert left_v == right_v
+
+    for left_t, right_t in zip(left._stats, right._stats):
+        assert_tensors_equal(left_t, right_t)
+
+    assert_dicts_equal(left.more_stats, right.more_stats)
+
+
+def assert_tensors_equal(left: Stat, right: Stat, **kwargs):
+    if isinstance(left, pd.DataFrame):
+        assert_f = partial(pd.testing.assert_frame_equal, **kwargs)
+    elif isinstance(left, pd.Series):
+        assert_f = partial(pd.testing.assert_series_equal, **kwargs)
+
+    assert_f(left, right)
 
 
 def create_named_df(data: Dict, index: List[str], name: str) -> pd.DataFrame:
diff --git a/tests/rules/test_compare.py b/tests/rules/test_compare.py
index 42802c2..ff830a3 100644
--- a/tests/rules/test_compare.py
+++ b/tests/rules/test_compare.py
@@ -5,76 +5,106 @@
 
 
 @pytest.mark.parametrize(
-    ["source", "target", "fields", "normalize", "expected"],
+    ["source", "target", "fields", "normalize", "expected", "more_stats"],
     [
         (
             {
-                "one": list(range(50)) + ["42"] * 50,
-                "two": list(range(100)),
-                "three": [np.nan] * 50 + list(range(50)),
+                "one": list(range(5)) + ["42"] * 5,
+                "two": list(range(10)),
+                "three": [np.nan] * 5 + list(range(5)),
             },
             {
-                "one": list(range(50, 100)) + [42] * 500,
-                "two": list(range(550)),
-                "three": [np.nan] * 500 + list(range(50)),
+                "one": list(range(5, 10)) + [4] * 6,
+                "two": list(range(11)),
+                "three": [np.nan] * 10 + [1],
             },
             ["one", "two", "three"],
             False,
             {
                 Level.INFO: [
-                    ("100 `non NaN ones` - 99 new, 1 same",),
-                    (
-                        "50 `ones` are missing",
-                        None,
-                        {"50, 51, 52, 53, 54... `ones` are missing": set(range(50))},
-                    ),
-                    ("100 `non NaN twos` - 0 new, 100 same",),
-                    ("50 `non NaN threes` - 0 new, 50 same",),
+                    ("10 `non NaN ones` - 9 new, 1 same",),
+                    ("10 `non NaN twos` - 0 new, 10 same",),
+                    ("1 `twos` are missing", None, {"10 `twos` are missing": {10}}),
+                    ("5 `non NaN threes` - 4 new, 1 same",),
                 ],
                 Level.ERROR: [
                     (
-                        "450 `twos` are missing",
+                        "5 `ones` are missing",
                         None,
-                        {
-                            "100, 101, 102, 103, 104... `twos` are missing": set(
-                                range(100, 550)
-                            )
-                        },
+                        {"5, 6, 7, 8, 9 `ones` are missing": set(range(5))},
                     )
                 ],
             },
+            {
+                "one": {
+                    "same": pd.Series([4], index=[4], dtype="object"),
+                    "new": pd.Series(
+                        [0, 1, 2, 3] + ["42"] * 5, index=[0, 1, 2, 3, 5, 6, 7, 8, 9]
+                    ),
+                    "missing": pd.Series(list(range(5, 10))),
+                },
+                "two": {
+                    "same": pd.Series(list(range(10))),
+                    "new": pd.Series(dtype=np.int64),
+                    "missing": pd.Series([10], index=[10]),
+                },
+                "three": {
+                    "same": pd.Series([1.0], index=[6]),
+                    "new": pd.Series([0.0, 2.0, 3.0, 4.0], index=[5, 7, 8, 9]),
+                    "missing": pd.Series(),
+                },
+            },
         ),
         (
             {
-                "four": [{i} for i in range(10)]
-                + [{"K": {"k": i}} for i in range(10)]
-                + ["l"] * 80
+                "four": [{i} for i in range(2)]
+                + [{"K": {"k": i}} for i in range(2)]
+                + ["l"] * 6
             },
             {
-                "four": [{i} for i in range(20)]
-                + [{"k": {"k": i}} for i in range(10)]
-                + ["L"] * 520
+                "four": [{i} for i in range(4)]
+                + [{"k": {"k": i}} for i in range(4)]
+                + ["L"] * 20
             },
             ["four"],
             True,
             {
                 Level.INFO: [
-                    ("100 `non NaN fours` - 0 new, 100 same",),
+                    ("10 `non NaN fours` - 0 new, 10 same",),
                     (
-                        "10 `fours` are missing",
+                        "4 `fours` are missing",
                         None,
                         {
-                            "{10}, {11}, {12}, {13}, {14}... `fours` are missing": set(
-                                range(10, 20)
-                            )
+                            "{2}, {3}, {'k': {'k': 2}}, {'k': {'k': 3}} `fours` are missing": {
+                                2,
+                                3,
+                                6,
+                                7,
+                            }
                         },
                     ),
                 ]
             },
+            {
+                "four": {
+                    "same": pd.Series(
+                        [str({i}) for i in range(2)]
+                        + [str({"k": {"k": i}}) for i in range(2)]
+                        + ["l"] * 6
+                    ),
+                    "new": pd.Series(dtype=object),
+                    "missing": pd.Series(
+                        ["{2}", "{3}", "{'k': {'k': 2}}", "{'k': {'k': 3}}"],
+                        index={2, 3, 6, 7},
+                    ),
+                }
+            },
         ),
     ],
 )
-def test_fields(source, target, fields, normalize, expected):
-    assert compare.fields(
-        pd.DataFrame(source), pd.DataFrame(target), fields, normalize
-    ) == create_result("Fields Difference", expected)
+def test_fields(source, target, fields, normalize, expected, more_stats):
+    assert_results_equal(
+        compare.fields(pd.DataFrame(source), pd.DataFrame(target), fields, normalize),
+        create_result("Fields Difference", expected, more_stats=more_stats),
+        check_index_type=False,
+    )
diff --git a/tests/rules/test_result.py b/tests/rules/test_result.py
index 23ca2b6..c9ebb0b 100755
--- a/tests/rules/test_result.py
+++ b/tests/rules/test_result.py
@@ -57,34 +57,6 @@ def test_result_err_keys(messages, true_err_keys):
     assert Result("x", messages=messages).err_keys == true_err_keys
 
 
-@pytest.mark.parametrize(
-    "source, target",
-    [
-        (
-            pd.Series([0, 1], index=["f", "l"], name="n"),
-            pd.Series([0, 1], index=["f", "l"], name="n"),
-        ),
-        (pd.DataFrame([0, 1]), pd.DataFrame([0, 1])),
-    ],
-)
-def test_tensors_equal(source, target):
-    assert Result.tensors_equal(source, target)
-
-
-@pytest.mark.parametrize(
-    "source, target",
-    [
-        (
-            pd.Series([0, 1], index=["f", "l"], name="s"),
-            pd.Series([0, 1], index=["f", "l"], name="n"),
-        ),
-        (pd.DataFrame([0, 1]), pd.DataFrame([0, 1], index=["m", "s"])),
-    ],
-)
-def test_tensors_not_equal(source, target):
-    assert not Result.tensors_equal(source, target)
-
-
 @pytest.mark.parametrize(
     "message, stats, outputs",
     [
@@ -111,59 +83,3 @@ def test_show(mocker, capsys, message, stats, outputs):
     res.show()
     mock_pio_show.assert_called_once_with(res.figures[0])
     mocked_md.assert_has_calls(mocker.call(o) for o in outputs)
-
-
-@pytest.mark.parametrize(
-    "left_params, right_params",
-    [
-        (
-            (
-                "s",
-                {Level.INFO: ["sum", "det", {"err1": [0, 1]}]},
-                [pd.Series([0], name="s"), pd.DataFrame({"s": [0]})],
-                2,
-                ["err1"],
-                1,
-            ),
-            (
-                "s",
-                {Level.INFO: ["sum", "det", {"err1": [0, 1]}]},
-                [pd.Series([0], name="s"), pd.DataFrame({"s": [0]})],
-                2,
-                ["err1"],
-                1,
-            ),
-        ),
-        (("s",), ("s",)),
-    ],
-)
-def test_result_equal(left_params, right_params):
-    assert Result(*left_params) == Result(*right_params)
-
-
-@pytest.mark.parametrize(
-    "left_params, right_params",
-    [
-        (
-            (
-                "s",
-                {Level.INFO: ["sum", "det", {"err1": [0, 1]}]},
-                [pd.Series([0], name="A name"), pd.DataFrame([0])],
-                2,
-                ["err1"],
-                1,
-            ),
-            (
-                "s",
-                {Level.INFO: ["sum", "det", {"err1": [0, 1]}]},
-                [pd.Series([0], name="A series name"), pd.DataFrame([0])],
-                2,
-                ["err1"],
-                1,
-            ),
-        ),
-        (("s",), ("t",)),
-    ],
-)
-def test_result_not_equal(left_params, right_params):
-    assert Result(*left_params) != Result(*right_params)
diff --git a/tests/test_conftest.py b/tests/test_conftest.py
new file mode 100644
index 0000000..2b12c17
--- /dev/null
+++ b/tests/test_conftest.py
@@ -0,0 +1,30 @@
+from conftest import *
+
+
+@pytest.mark.parametrize(
+    "source, target",
+    [
+        (
+            pd.Series([0, 1], index=["f", "l"], name="n"),
+            pd.Series([0, 1], index=["f", "l"], name="n"),
+        ),
+        (pd.DataFrame([0, 1]), pd.DataFrame([0, 1])),
+    ],
+)
+def test_assert_tensors_equal(source, target):
+    assert_tensors_equal(source, target)  # identical pairs must not raise
+
+
+@pytest.mark.parametrize(
+    "source, target",
+    [
+        (
+            pd.Series([0, 1], index=["f", "l"], name="s"),
+            pd.Series([0, 1], index=["f", "l"], name="n"),
+        ),
+        (pd.DataFrame([0, 1]), pd.DataFrame([0, 1], index=["m", "s"])),
+    ],
+)
+def test_assert_tensors_not_equal(source, target):
+    with pytest.raises(AssertionError):  # a name or index mismatch must fail
+        assert_tensors_equal(source, target)

From 22cd4009fe6fcade0a18923192a09730cbe445c7 Mon Sep 17 00:00:00 2001
From: manycoding <manycoding@users.noreply.github.com>
Date: Thu, 26 Sep 2019 15:48:14 -0300
Subject: [PATCH 08/10] Update rules to new assert

---
 tests/rules/test_category.py    | 32 ++++++++++++++++++++------------
 tests/rules/test_coverage.py    | 26 +++++++++++++++-----------
 tests/rules/test_duplicates.py  | 26 ++++++++++++++++----------
 tests/rules/test_json_schema.py | 20 +++++++++++++-------
 tests/rules/test_metadata.py    | 23 ++++++++++++++---------
 tests/rules/test_others.py      | 13 ++++++++-----
 tests/rules/test_price.py       | 22 +++++++++++++++-------
 7 files changed, 101 insertions(+), 61 deletions(-)

diff --git a/tests/rules/test_category.py b/tests/rules/test_category.py
index 462d4c4..9f9963a 100755
--- a/tests/rules/test_category.py
+++ b/tests/rules/test_category.py
@@ -1,6 +1,6 @@
 import arche.rules.category as c
 from arche.rules.result import Level
-from conftest import create_result, create_named_df
+from conftest import *
 import numpy as np
 import pandas as pd
 import pytest
@@ -21,8 +21,11 @@
     ],
 )
 def test_get_coverage_per_category(data, cat_names, expected_messages, expected_stats):
-    assert c.get_coverage_per_category(pd.DataFrame(data), cat_names) == create_result(
-        "Coverage For Scraped Categories", expected_messages, expected_stats
+    assert_results_equal(
+        c.get_coverage_per_category(pd.DataFrame(data), cat_names),
+        create_result(
+            "Coverage For Scraped Categories", expected_messages, expected_stats
+        ),
     )
 
 
@@ -81,10 +84,11 @@ def test_get_coverage_per_category(data, cat_names, expected_messages, expected_
     ],
 )
 def test_get_difference(source, target, categories, expected_messages, expected_stats):
-    assert c.get_difference(
-        pd.DataFrame(source), pd.DataFrame(target), categories
-    ) == create_result(
-        "Category Coverage Difference", expected_messages, stats=expected_stats
+    assert_results_equal(
+        c.get_difference(pd.DataFrame(source), pd.DataFrame(target), categories),
+        create_result(
+            "Category Coverage Difference", expected_messages, stats=expected_stats
+        ),
     )
 
 
@@ -96,8 +100,10 @@ def test_get_difference(source, target, categories, expected_messages, expected_
     ],
 )
 def test_get_no_categories(data, expected_message):
-    result = c.get_categories(pd.DataFrame(data))
-    assert result == create_result("Categories", {Level.INFO: [(expected_message,)]})
+    assert_results_equal(
+        c.get_categories(pd.DataFrame(data)),
+        create_result("Categories", {Level.INFO: [(expected_message,)]}),
+    )
 
 
 @pytest.mark.parametrize(
@@ -129,9 +135,11 @@ def test_get_no_categories(data, expected_message):
     ],
 )
 def test_get_categories(data, max_uniques, expected_stats, expected_message):
-    result = c.get_categories(pd.DataFrame(data), max_uniques)
-    assert result == create_result(
-        "Categories", {Level.INFO: [(expected_message,)]}, stats=expected_stats
+    assert_results_equal(
+        c.get_categories(pd.DataFrame(data), max_uniques),
+        create_result(
+            "Categories", {Level.INFO: [(expected_message,)]}, stats=expected_stats
+        ),
     )
 
 
diff --git a/tests/rules/test_coverage.py b/tests/rules/test_coverage.py
index 9b0cf9c..8433452 100755
--- a/tests/rules/test_coverage.py
+++ b/tests/rules/test_coverage.py
@@ -2,7 +2,7 @@
 
 import arche.rules.coverage as cov
 from arche.rules.result import Level, Outcome
-from conftest import create_result, create_named_df, Job
+from conftest import *
 import pandas as pd
 import pytest
 
@@ -36,8 +36,10 @@
     ],
 )
 def test_check_fields_coverage(df, expected_messages, expected_stats):
-    result = cov.check_fields_coverage(df)
-    assert result == create_result("Fields Coverage", expected_messages, expected_stats)
+    assert_results_equal(
+        cov.check_fields_coverage(df),
+        create_result("Fields Coverage", expected_messages, expected_stats),
+    )
 
 
 @pytest.mark.parametrize(
@@ -114,11 +116,11 @@ def test_check_fields_coverage(df, expected_messages, expected_stats):
     ],
 )
 def test_get_difference(source_stats, target_stats, expected_messages, expected_stats):
-    result = cov.get_difference(
-        Job(stats=source_stats, key="s"), Job(stats=target_stats, key="t")
-    )
-    assert result == create_result(
-        "Coverage Difference", expected_messages, stats=expected_stats
+    assert_results_equal(
+        cov.get_difference(
+            Job(stats=source_stats, key="s"), Job(stats=target_stats, key="t")
+        ),
+        create_result("Coverage Difference", expected_messages, stats=expected_stats),
     )
 
 
@@ -133,7 +135,7 @@ def test_compare_scraped_fields(source_cols, target_cols, expected_messages):
     result = cov.compare_scraped_fields(
         pd.DataFrame([], columns=source_cols), pd.DataFrame([], columns=target_cols)
     )
-    assert result == create_result("Scraped Fields", expected_messages)
+    assert_results_equal(result, create_result("Scraped Fields", expected_messages))
 
 
 @pytest.mark.parametrize(
@@ -191,5 +193,7 @@ def test_anomalies(
         for key, counts, input_values in jobs_stats
     ]
     mocker.patch("arche.rules.coverage.api.get_jobs", return_value=jobs)
-    result = cov.anomalies(jobs_stats[-1][0], [key for key, *_ in jobs_stats[:-1]])
-    assert result == create_result("Anomalies", expected_messages, stats=stats)
+    assert_results_equal(
+        cov.anomalies(jobs_stats[-1][0], [key for key, *_ in jobs_stats[:-1]]),
+        create_result("Anomalies", expected_messages, stats=stats),
+    )
diff --git a/tests/rules/test_duplicates.py b/tests/rules/test_duplicates.py
index 90f6a8b..9dd2e7f 100755
--- a/tests/rules/test_duplicates.py
+++ b/tests/rules/test_duplicates.py
@@ -1,6 +1,6 @@
 import arche.rules.duplicates as duplicates
 from arche.rules.result import Level, Outcome
-from conftest import create_result
+from conftest import *
 import numpy as np
 import pandas as pd
 import pytest
@@ -45,8 +45,11 @@
 @pytest.mark.parametrize("data, tagged_fields, expected_messages", unique_inputs)
 def test_find_by_unique(data, tagged_fields, expected_messages):
     df = pd.DataFrame(data)
-    assert duplicates.find_by_unique(df, tagged_fields) == create_result(
-        "Duplicates By **unique** Tag", expected_messages, items_count=len(df)
+    assert_results_equal(
+        duplicates.find_by_unique(df, tagged_fields),
+        create_result(
+            "Duplicates By **unique** Tag", expected_messages, items_count=len(df)
+        ),
     )
 
 
@@ -80,8 +83,9 @@ def test_find_by_unique(data, tagged_fields, expected_messages):
 )
 def test_find_by(data, columns, expected_messages):
     df = pd.DataFrame(data)
-    assert duplicates.find_by(df, columns) == create_result(
-        "Duplicates", expected_messages, items_count=len(df)
+    assert_results_equal(
+        duplicates.find_by(df, columns),
+        create_result("Duplicates", expected_messages, items_count=len(df)),
     )
 
 
@@ -111,9 +115,11 @@ def test_find_by(data, columns, expected_messages):
 )
 def test_find_by_name_url(data, tagged_fields, expected_messages):
     df = pd.DataFrame(data)
-    result = duplicates.find_by_name_url(df, tagged_fields)
-    assert result == create_result(
-        "Duplicates By **name_field, product_url_field** Tags",
-        expected_messages,
-        items_count=len(df),
+    assert_results_equal(
+        duplicates.find_by_name_url(df, tagged_fields),
+        create_result(
+            "Duplicates By **name_field, product_url_field** Tags",
+            expected_messages,
+            items_count=len(df),
+        ),
     )
diff --git a/tests/rules/test_json_schema.py b/tests/rules/test_json_schema.py
index da0bdd2..80cd0d6 100755
--- a/tests/rules/test_json_schema.py
+++ b/tests/rules/test_json_schema.py
@@ -1,6 +1,6 @@
 from arche.rules.json_schema import check_tags, validate
 from arche.rules.result import Level
-from conftest import create_result
+from conftest import *
 import pytest
 
 
@@ -114,8 +114,10 @@
     "source_columns, target_columns, tags, expected_messages", tags_inputs
 )
 def test_check_tags(source_columns, target_columns, tags, expected_messages):
-    result = check_tags(source_columns, target_columns, tags)
-    assert result == create_result("Tags", expected_messages)
+    assert_results_equal(
+        check_tags(source_columns, target_columns, tags),
+        create_result("Tags", expected_messages),
+    )
 
 
 @pytest.mark.parametrize(
@@ -147,10 +149,14 @@ def test_check_tags(source_columns, target_columns, tags, expected_messages):
     ],
 )
 def test_validate(get_raw_items, schema, expected_messages):
-    result = validate(schema, get_raw_items, range(len(get_raw_items)))
-    assert result == create_result("JSON Schema Validation", expected_messages)
+    assert_results_equal(
+        validate(schema, get_raw_items, range(len(get_raw_items))),
+        create_result("JSON Schema Validation", expected_messages),
+    )
 
 
 def test_validate_passed(get_schema, get_raw_items):
-    result = validate(get_schema, get_raw_items, range(len(get_raw_items)))
-    assert result == create_result("JSON Schema Validation", {})
+    assert_results_equal(
+        validate(get_schema, get_raw_items, range(len(get_raw_items))),
+        create_result("JSON Schema Validation", {}),
+    )
diff --git a/tests/rules/test_metadata.py b/tests/rules/test_metadata.py
index 0f58303..5e8afd7 100755
--- a/tests/rules/test_metadata.py
+++ b/tests/rules/test_metadata.py
@@ -6,7 +6,7 @@
     compare_response_ratio,
 )
 from arche.rules.result import Level
-from conftest import create_result, Job
+from conftest import *
 import pytest
 
 
@@ -32,7 +32,9 @@ def test_check_errors(get_job, error_count, expected_messages):
     job.metadata = {"scrapystats": error_count}
     job.key = "112358/13/21"
 
-    assert check_errors(job) == create_result("Job Errors", expected_messages)
+    assert_results_equal(
+        check_errors(job), create_result("Job Errors", expected_messages)
+    )
 
 
 outcome_input = [
@@ -70,8 +72,9 @@ def test_check_outcome(get_job, metadata, expected_messages):
     job = get_job
     job.metadata = metadata
 
-    result = check_outcome(job)
-    assert result == create_result("Job Outcome", expected_messages)
+    assert_results_equal(
+        check_outcome(job), create_result("Job Outcome", expected_messages)
+    )
 
 
 time_inputs = [
@@ -124,8 +127,10 @@ def test_compare_finish_time(
     source_job.metadata = source_metadata
     target_job.metadata = target_metadata
 
-    result = compare_finish_time(source_job, target_job)
-    assert result == create_result("Finish Time", expected_messages)
+    assert_results_equal(
+        compare_finish_time(source_job, target_job),
+        create_result("Finish Time", expected_messages),
+    )
 
 
 compare_response_ratio_inputs = [
@@ -163,7 +168,7 @@ def test_compare_response_ratio(
     source_job = Job(stats=source_stats, metadata=source_metadata)
     target_job = Job(stats=target_stats, metadata=target_metadata)
 
-    result = compare_response_ratio(source_job, target_job)
-    assert result == create_result(
-        "Compare Responses Per Item Ratio", expected_messages
+    assert_results_equal(
+        compare_response_ratio(source_job, target_job),
+        create_result("Compare Responses Per Item Ratio", expected_messages),
     )
diff --git a/tests/rules/test_others.py b/tests/rules/test_others.py
index 491516b..b9f9b55 100755
--- a/tests/rules/test_others.py
+++ b/tests/rules/test_others.py
@@ -2,7 +2,7 @@
 
 from arche.rules.others import compare_boolean_fields, garbage_symbols
 from arche.rules.result import Level, Outcome
-from conftest import create_named_df, create_result
+from conftest import *
 import pandas as pd
 import pytest
 
@@ -64,8 +64,8 @@ def test_compare_boolean_fields(
     source_df = pd.DataFrame(source_data)
     target_df = pd.DataFrame(target_data)
     rule_result = compare_boolean_fields(source_df, target_df)
-    assert rule_result == create_result(
-        "Boolean Fields", expected_messages, expected_stats
+    assert_results_equal(
+        rule_result, create_result("Boolean Fields", expected_messages, expected_stats)
     )
 
 
@@ -112,6 +112,9 @@ def test_compare_boolean_fields(
     "raw_items, expected_messages, expected_items_count", dirty_inputs
 )
 def test_garbage_symbols(raw_items, expected_messages, expected_items_count):
-    assert garbage_symbols(pd.DataFrame(raw_items)) == create_result(
-        "Garbage Symbols", expected_messages, items_count=expected_items_count
+    assert_results_equal(
+        garbage_symbols(pd.DataFrame(raw_items)),
+        create_result(
+            "Garbage Symbols", expected_messages, items_count=expected_items_count
+        ),
     )
diff --git a/tests/rules/test_price.py b/tests/rules/test_price.py
index 237a13c..0f6fbc3 100755
--- a/tests/rules/test_price.py
+++ b/tests/rules/test_price.py
@@ -1,6 +1,6 @@
 import arche.rules.price as p
 from arche.rules.result import Level, Outcome
-from conftest import create_result
+from conftest import *
 import numpy as np
 import pandas as pd
 import pytest
@@ -51,9 +51,11 @@
 @pytest.mark.parametrize("data, tagged_fields, expected_messages", was_now_inputs)
 def test_compare_was_now(data, tagged_fields, expected_messages):
     df = pd.DataFrame(data)
-    result = p.compare_was_now(df, tagged_fields)
-    assert result == create_result(
-        "Compare Price Was And Now", expected_messages, items_count=len(df)
+    assert_results_equal(
+        p.compare_was_now(df, tagged_fields),
+        create_result(
+            "Compare Price Was And Now", expected_messages, items_count=len(df)
+        ),
     )
 
 
@@ -88,7 +90,9 @@ def test_compare_prices_for_same_urls(
     result = p.compare_prices_for_same_urls(
         pd.DataFrame(source_data), pd.DataFrame(target_data), tagged_fields
     )
-    assert result == create_result("Compare Prices For Same Urls", expected_messages)
+    assert_results_equal(
+        result, create_result("Compare Prices For Same Urls", expected_messages)
+    )
 
 
 compare_names_inputs = [
@@ -122,7 +126,9 @@ def test_compare_names_for_same_urls(
     result = p.compare_names_for_same_urls(
         pd.DataFrame(source_data), pd.DataFrame(target_data), tagged_fields
     )
-    assert result == create_result("Compare Names Per Url", expected_messages)
+    assert_results_equal(
+        result, create_result("Compare Names Per Url", expected_messages)
+    )
 
 
 @pytest.mark.parametrize(
@@ -152,4 +158,6 @@ def test_compare_prices_for_same_names(
     result = p.compare_prices_for_same_names(
         pd.DataFrame(source_data), pd.DataFrame(target_data), tagged_fields
     )
-    assert result == create_result("Compare Prices For Same Names", expected_messages)
+    assert_results_equal(
+        result, create_result("Compare Prices For Same Names", expected_messages)
+    )

From 96e4aa629b819655d0c5e665e2443e6b1d5bee11 Mon Sep 17 00:00:00 2001
From: manycoding <manycoding@users.noreply.github.com>
Date: Thu, 26 Sep 2019 15:48:24 -0300
Subject: [PATCH 09/10] Fix tqdm warning

---
 Pipfile                     |  1 +
 src/arche/readers/items.py  |  4 ++--
 src/arche/rules/category.py |  4 ++--
 src/arche/rules/others.py   |  6 ++----
 src/arche/tools/api.py      |  4 ++--
 src/arche/tools/schema.py   | 10 +++-------
 6 files changed, 12 insertions(+), 17 deletions(-)

diff --git a/Pipfile b/Pipfile
index 1ff4a44..c53a891 100755
--- a/Pipfile
+++ b/Pipfile
@@ -35,6 +35,7 @@ recommonmark = "*"
 sphinxcontrib-golangdomain = {git = "https://bitbucket.org/ymotongpoo/sphinxcontrib-golangdomain"}
 sphinx-autoapi = {git = "https://github.com/rtfd/sphinx-autoapi"}
 nbsphinx = "*"
+sphinx_bootstrap_theme = "*"
 memory-profiler = "*"
 jupyter-console = "*"
 matplotlib = "*"
diff --git a/src/arche/readers/items.py b/src/arche/readers/items.py
index c33ac24..fb9478b 100755
--- a/src/arche/readers/items.py
+++ b/src/arche/readers/items.py
@@ -8,7 +8,7 @@
 import pandas as pd
 from scrapinghub import ScrapinghubClient
 from scrapinghub.client.jobs import Job
-from tqdm import tqdm_notebook
+from tqdm.notebook import tqdm
 
 RawItems = Iterable[Dict[str, Any]]
 
@@ -33,7 +33,7 @@ def categorize(df: pd.DataFrame) -> pd.DataFrame:
         """Cast columns with repeating values to `category` type to save memory"""
         if len(df) < 100:
             return
-        for c in tqdm_notebook(df.columns, desc="Categorizing"):
+        for c in tqdm(df.columns, desc="Categorizing"):
             try:
                 if df[c].nunique(dropna=False) <= 10:
                     df[c] = df[c].astype("category")
diff --git a/src/arche/rules/category.py b/src/arche/rules/category.py
index 5970acb..f4691b1 100755
--- a/src/arche/rules/category.py
+++ b/src/arche/rules/category.py
@@ -2,7 +2,7 @@
 
 from arche.rules.result import Outcome, Result
 import pandas as pd
-from tqdm import tqdm_notebook
+from tqdm.notebook import tqdm
 
 
 def get_difference(
@@ -97,7 +97,7 @@ def get_categories(df: pd.DataFrame, max_uniques: int = 10) -> Result:
     columns = find_likely_cats(df, max_uniques)
     result.stats = [
         value_counts
-        for value_counts in tqdm_notebook(
+        for value_counts in tqdm(
             map(lambda c: df[c].value_counts(dropna=False), columns),
             desc="Finding categories",
             total=len(columns),
diff --git a/src/arche/rules/others.py b/src/arche/rules/others.py
index 211872f..4851a2f 100755
--- a/src/arche/rules/others.py
+++ b/src/arche/rules/others.py
@@ -4,7 +4,7 @@
 from arche.rules.result import Outcome, Result
 import numpy as np
 import pandas as pd
-from tqdm import tqdm_notebook
+from tqdm.notebook import tqdm
 
 
 def compare_boolean_fields(
@@ -93,9 +93,7 @@ def garbage_symbols(df: pd.DataFrame) -> Result:
     row_keys = set()
     rule_result = Result("Garbage Symbols", items_count=len(df))
 
-    for column in tqdm_notebook(
-        df.select_dtypes([np.object]).columns, desc="Garbage Symbols"
-    ):
+    for column in tqdm(df.select_dtypes([np.object]).columns, desc="Garbage Symbols"):
         matches = df[column].apply(str).str.extractall(garbage, flags=re.IGNORECASE)
         if not matches.empty:
             error_keys = df.loc[matches.unstack().index.values].index
diff --git a/src/arche/tools/api.py b/src/arche/tools/api.py
index cd56b2a..c5a30b8 100755
--- a/src/arche/tools/api.py
+++ b/src/arche/tools/api.py
@@ -10,7 +10,7 @@
 import numpy as np
 from scrapinghub import ScrapinghubClient
 from scrapinghub.client.jobs import Job
-from tqdm import tqdm, tqdm_notebook
+from tqdm import tqdm, notebook
 
 
 Filters = List[Tuple[str, str, str]]
@@ -163,7 +163,7 @@ def get_items(
     start_index: int,
     start: Optional[str],
     filters: Optional[Filters] = None,
-    p_bar: Union[tqdm, tqdm_notebook] = tqdm_notebook,
+    p_bar: Union[tqdm, notebook.tqdm] = notebook.tqdm,
     desc: Optional[str] = None,
 ) -> np.ndarray:
     source = get_source(key)
diff --git a/src/arche/tools/schema.py b/src/arche/tools/schema.py
index 3b607dc..a61eecc 100755
--- a/src/arche/tools/schema.py
+++ b/src/arche/tools/schema.py
@@ -10,7 +10,7 @@
 from genson import SchemaBuilder
 from jsonschema import FormatChecker, validators
 import pandas as pd
-from tqdm import tqdm_notebook
+from tqdm.notebook import tqdm
 
 
 def basic_json_schema(data_source: str, items_numbers: List[int] = None) -> Schema:
@@ -95,9 +95,7 @@ def fast_validate(
     errors = defaultdict(set)
 
     validate = fastjsonschema.compile(schema)
-    for i, raw_item in enumerate(
-        tqdm_notebook(raw_items, desc="Fast Schema Validation")
-    ):
+    for i, raw_item in enumerate(tqdm(raw_items, desc="Fast Schema Validation")):
         raw_item.pop("_type", None)
         raw_item.pop("_key", None)
         try:
@@ -117,9 +115,7 @@ def full_validate(
 
     validator = validators.validator_for(schema)(schema)
     validator.format_checker = FormatChecker()
-    for i, raw_item in enumerate(
-        tqdm_notebook(raw_items, desc="JSON Schema Validation")
-    ):
+    for i, raw_item in enumerate(tqdm(raw_items, desc="JSON Schema Validation")):
         raw_item.pop("_type", None)
         raw_item.pop("_key", None)
         for e in validator.iter_errors(raw_item):

From 933d05eed7f6daa71e6af8d70bed28c6090ea34b Mon Sep 17 00:00:00 2001
From: manycoding <manycoding@users.noreply.github.com>
Date: Mon, 30 Sep 2019 12:42:21 -0300
Subject: [PATCH 10/10] Add line to changes, describe err_thr

---
 CHANGES.md                 | 1 +
 src/arche/rules/compare.py | 3 ++-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/CHANGES.md b/CHANGES.md
index 1e0cc67..9b6c73d 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -14,6 +14,7 @@ Note that the top-most release is changes in the unreleased master branch on Git
 ### Added
 - **Anomalies** to see significant deviations in fields coverage across multiple jobs, #138
 - Support to **Bitbucket API**, in order to access files from private repositories, #71
+- **Fields Difference** rule to find the difference between field values of two jobs. Supports normalization and nested fields, and gives full access to the data, #167
 
 
 ## [0.3.6] (2019-07-12)
diff --git a/src/arche/rules/compare.py b/src/arche/rules/compare.py
index eb4a237..73c2783 100644
--- a/src/arche/rules/compare.py
+++ b/src/arche/rules/compare.py
@@ -12,11 +12,12 @@ def fields(
     normalize: bool = False,
     err_thr: float = 0.25,
 ) -> Result:
-    """Return fields values difference between dataframe.
+    """Finds fields values difference between dataframes.
 
     Args:
         names - a list of field names
         normalize - if set, all fields converted to str and processed with lower() and strip()
+        err_thr - sets the failure threshold for missing values
 
     Returns:
         Result with same, missing and new values.