feat(registry): generalize registry

Generalized the registry pattern shared by profiles and comparisons into a single `Registry` class.
ing-bank · Jul 4, 2022 · d01c68a · d01c68a
1 parent 78bae81
commit d01c68a
Show file tree

Hide file tree

Showing 10 changed files with 199 additions and 98 deletions.
diff --git a/popmon/analysis/__init__.py b/popmon/analysis/__init__.py
@@ -19,5 +19,7 @@
 
 
 from ..analysis.apply_func import ApplyFunc
+from .comparison import Comparisons
+from .profiling import Profiles
 
-__all__ = ["ApplyFunc"]
+__all__ = ["ApplyFunc", "Comparisons", "Profiles"]
diff --git a/popmon/analysis/comparison/__init__.py b/popmon/analysis/comparison/__init__.py
@@ -16,10 +16,6 @@
 # COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
 # IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-
-import popmon.analysis.comparison.comparisons  # noqa
-
 from ...analysis.comparison.hist_comparer import (
     ExpandingHistComparer,
     ExpandingNormHistComparer,
@@ -28,8 +24,10 @@
     RollingHistComparer,
     RollingNormHistComparer,
 )
+from .comparisons import Comparisons
 
 __all__ = [
+    "Comparisons",
     "ReferenceHistComparer",
     "RollingHistComparer",
     "ExpandingHistComparer",

diff --git a/popmon/analysis/comparison/comparisons.py b/popmon/analysis/comparison/comparisons.py
@@ -21,12 +21,15 @@
 import numpy as np
 from scipy import stats
 
-from popmon.analysis.comparison.comparison_registry import Comparisons
+from popmon.base.registry import Registry
+
+Comparisons = Registry()
 
 
 @Comparisons.register(
     key="max_prob_diff",
     description="The largest absolute difference between all bin pairs of two normalized histograms (one histogram in a time slot and one in {ref})",
+    htype="all",
 )
 def googl_test(bins_1, bins_2):
     """Google-paper test
@@ -47,7 +50,7 @@ def dist(bins):
     return np.max(np.abs(dist(bins_1) - dist(bins_2)))
 
 
-@Comparisons.register(key="psi", description="Population Stability Index")
+@Comparisons.register(key="psi", description="Population Stability Index", htype="all")
 def population_stability_index(po, qo):
     epsilon = 10e-6
     p = po.copy()
@@ -66,7 +69,7 @@ def kullback_leibler_divergence(po, qo):
     return np.sum(p * np.log(p / q))
 
 
-@Comparisons.register(key="jsd", description="Jensen-Shannon Divergence")
+@Comparisons.register(key="jsd", description="Jensen-Shannon Divergence", htype="all")
 def jensen_shannon_divergence(p, q):
     m = 0.5 * (p + q)
     return 0.5 * (kullback_leibler_divergence(p, m) + kullback_leibler_divergence(q, m))
@@ -177,6 +180,7 @@ def unknown_labels(hist1, hist2):
     key="pearson",
     description="Pearson correlation between each time slot and {ref}",
     dim=(2,),
+    htype="all",
 )
 def pearson(p, q, *args):
     # calculate pearson coefficient
@@ -264,6 +268,7 @@ def _not_finite_to_zero(x):
         "The number of normalized residuals of all bin pairs (one histogram in a time"
         + " slot and one in {ref}) with absolute value bigger than a given threshold (default: 7).",
     ],
+    htype="all",
 )
 def chi2(*args, max_res_bound=7.0):
     chi2r, chi2_norm, zscore, pvalue, res = uu_chi2(*args)

diff --git a/popmon/analysis/comparison/hist_comparer.py b/popmon/analysis/comparison/hist_comparer.py
@@ -52,7 +52,7 @@ def hist_compare(row, hist_name1="", hist_name2=""):
     :param str hist_name2: name of histogram two to compare
     :return: pandas Series with popular comparison metrics.
     """
-    from .comparison_registry import Comparisons
+    from popmon.analysis.comparison import Comparisons
 
     x = {key: np.nan for key in Comparisons.get_keys()}
 
@@ -82,36 +82,15 @@ def hist_compare(row, hist_name1="", hist_name2=""):
             htype = "cat"
             args = [hist1, hist2]
 
-        for key, func in Comparisons.get_comparisons(dim=1, htype=htype).items():
-            results = func(*args)
-            if len(key) == 1:
-                results = (results,)
-            for k, v in zip(key, results):
-                x[k] = v
-
-        for key, func in Comparisons.get_comparisons(dim=1, htype="all").items():
-            results = func(*entries_list)
-            if len(key) == 1:
-                results = (results,)
-            for k, v in zip(key, results):
-                x[k] = v
+        x.update(Comparisons.run(args, dim=1, htype=htype))
+        x.update(Comparisons.run(entries_list, dim=1, htype="all"))
     else:
         numpy_ndgrids = get_consistent_numpy_ndgrids([hist1, hist2], dim=hist1.n_dim)
         entries_list = [entry.flatten() for entry in numpy_ndgrids]
 
-        for key, func in Comparisons.get_comparisons(dim=(2,)).items():
-            results = func(*entries_list)
-            if len(key) == 1:
-                results = (results,)
-            for k, v in zip(key, results):
-                x[k] = v
-
-    for key, func in Comparisons.get_comparisons(dim=-1).items():
-        results = func(*entries_list)
-        if len(key) == 1:
-            results = (results,)
-        for k, v in zip(key, results):
-            x[k] = v
+        x.update(Comparisons.run(entries_list, dim=(2,), htype="all"))
+
+    x.update(Comparisons.run(entries_list, dim=-1, htype="all"))
 
     if len(set(x.keys()) - set(Comparisons.get_keys())) > 0:
         raise ValueError("Could not compute full comparison")

diff --git a/popmon/analysis/profiling/__init__.py b/popmon/analysis/profiling/__init__.py
@@ -16,18 +16,16 @@
 # COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
 # IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-
-import popmon.analysis.profiling.profiles  # noqa
-
 from ...analysis.profiling.hist_profiler import HistProfiler
 from ...analysis.profiling.pull_calculator import (
     ExpandingPullCalculator,
     ReferencePullCalculator,
     RollingPullCalculator,
 )
+from .profiles import Profiles
 
 __all__ = [
+    "Profiles",
     "HistProfiler",
     "RollingPullCalculator",
     "ReferencePullCalculator",

diff --git a/popmon/analysis/profiling/hist_profiler.py b/popmon/analysis/profiling/hist_profiler.py
@@ -21,8 +21,6 @@
 import numpy as np
 import pandas as pd
 
-from popmon.analysis.profiling.profile_registry import Profiles
-
 from ...base import Module
 from ...hist.hist_utils import get_bin_centers, is_numeric, is_timestamp
 
@@ -76,6 +74,8 @@ def __init__(
             raise NotImplementedError()
 
     def _profile_1d_histogram(self, name, hist):
+        from popmon.analysis import Profiles
+
         # preprocessing value counts and TS
         is_num = is_numeric(hist)
         is_ts = is_timestamp(hist) or name in self.var_timestamp
@@ -95,29 +95,13 @@ def _profile_1d_histogram(self, name, hist):
 
         # calc 1d-histogram statistics
         profile = {}
-        for key, func in Profiles.get_profiles(dim=1, htype=otype).items():
-            args = [bin_labels, bin_counts]
-            results = func(*args)
-            if len(key) == 1:
-                results = (results,)
-            for k, v in zip(key, results):
-                profile[k] = v
-
-        for key, func in Profiles.get_profiles(dim=1, htype="all").items():
-            args = [bin_labels, bin_counts]
-            results = func(*args)
-            if len(key) == 1:
-                results = (results,)
-            for k, v in zip(key, results):
-                profile[k] = v
-
-        for key, func in Profiles.get_profiles(dim=1, htype=None).items():
-            args = [hist]
-            results = func(*args)
-            if len(key) == 1:
-                results = (results,)
-            for k, v in zip(key, results):
-                profile[k] = v
+        args = [bin_labels, bin_counts]
+
+        profile.update(Profiles.run(args, dim=1, htype=otype))
+        profile.update(Profiles.run(args, dim=1, htype="all"))
+
+        # htype=None functions receive the hist; htype="all" ones receive bin labels and counts
+        profile.update(Profiles.run([hist], dim=1, htype=None))
 
         # postprocessing TS
         if is_ts:
@@ -132,25 +116,26 @@ def _profile_1d_histogram(self, name, hist):
         return profile
 
     def _profile_nd_histogram(self, name, hist, dim):
+        from popmon.analysis import Profiles
+
         if hist.n_dim < dim:
             self.logger.warning(
                 f"Histogram {name} has {hist.n_dim} dimensions (<{dim}); cannot profile. Returning empty."
             )
             return {}
 
         # calc nd-histogram statistics
-        profile = {}
-        for key, func in Profiles.get_profiles(dim=dim).items():
-            results = func(hist)
-
-            if len(key) == 1:
-                results = (results,)
-            for k, v in zip(key, results):
-                profile[k] = v
+        profile = Profiles.run([hist], dim=dim, htype=None)
+        profile.update(Profiles.run([hist], dim=dim, htype="all"))
+        profile.update(Profiles.run([hist], dim=dim, htype="num"))
+        profile.update(Profiles.run([hist], dim=dim, htype="cat"))
 
+        profile.update(Profiles.run([hist], dim=-1, htype=None))
         return profile
 
     def _profile_hist(self, split, hist_name):
+        from popmon.analysis.profiling import Profiles
+
         if len(split) == 0:
             self.logger.error(f'Split histograms dict "{hist_name}" is empty. Return.')
             return []
@@ -162,18 +147,18 @@ def _profile_hist(self, split, hist_name):
 
         # these are the profiled quantities we will monitor
         expected_fields = (
-            Profiles.get_keys(dim=dimension, htype=htype)
-            + Profiles.get_keys(dim=dimension, htype="all")
-            + Profiles.get_keys(dim=dimension, htype=None)
+            Profiles.get_keys_by_dim_and_htype(dim=dimension, htype=htype)
+            + Profiles.get_keys_by_dim_and_htype(dim=dimension, htype="all")
+            + Profiles.get_keys_by_dim_and_htype(dim=dimension, htype=None)
         )
 
         # profiles regardless of dim and htype (e.g. count)
-        expected_fields += Profiles.get_keys(dim=None, htype=None)
+        expected_fields += Profiles.get_keys_by_dim_and_htype(dim=None, htype=None)
 
         # profiles regardless of dim
-        expected_fields += Profiles.get_keys(dim=-1, htype=htype)
-        expected_fields += Profiles.get_keys(dim=-1, htype="all")
-        expected_fields += Profiles.get_keys(dim=-1, htype=None)
+        expected_fields += Profiles.get_keys_by_dim_and_htype(dim=-1, htype=htype)
+        expected_fields += Profiles.get_keys_by_dim_and_htype(dim=-1, htype="all")
+        expected_fields += Profiles.get_keys_by_dim_and_htype(dim=-1, htype=None)
 
         expected_fields += [self.index_col, self.hist_col]
 
@@ -193,7 +178,8 @@ def _profile_hist(self, split, hist_name):
 
             if sorted(profile.keys()) != sorted(expected_fields):
                 self.logger.error(
-                    f'Could not extract full profile for sub-hist "{hist_name} {index}". Skipping.'
+                    f'Could not extract full profile for sub-hist "{hist_name} {index}".'
+                    f"Differences: {set(profile.keys()).symmetric_difference(set(expected_fields))}. Skipping."
                 )
             else:
                 profile_list.append(profile)

diff --git a/popmon/analysis/profiling/profiles.py b/popmon/analysis/profiling/profiles.py
@@ -20,11 +20,14 @@
 
 import numpy as np
 
+from popmon.base.registry import Registry
+
 from ...analysis.hist_numpy import get_2dgrid
-from ...analysis.profiling.profile_registry import Profiles
 from ...hist.hist_utils import sum_entries
 from ...stats import numpy as pm_np
 
+Profiles = Registry()
+
 
 @Profiles.register(
     key=["min", "max", "p01", "p05", "p16", "p50", "p84", "p95", "p99"],
@@ -43,8 +46,10 @@
     htype="num",
 )
 def profile_quantiles(x, w):
-    return pm_np.quantile(
-        x, q=[0.0, 1.0, 0.01, 0.05, 0.16, 0.50, 0.84, 0.95, 0.99], weights=w
+    return tuple(
+        pm_np.quantile(
+            x, q=[0.0, 1.0, 0.01, 0.05, 0.16, 0.50, 0.84, 0.95, 0.99], weights=w
+        )
     )
 
 
@@ -58,7 +63,9 @@ def profile_std(x, w):
     return pm_np.std(x, w)
 
 
-@Profiles.register(key="nan", description="Number of missing entries (NaN)", dim=1)
+@Profiles.register(
+    key="nan", description="Number of missing entries (NaN)", dim=1, htype=None
+)
 def profile_nan(hist):
     if hasattr(hist, "nanflow"):
         return hist.nanflow.entries
@@ -71,6 +78,7 @@ def profile_nan(hist):
     key="overflow",
     description="Number of values larger than the maximum bin-edge of the histogram.",
     dim=1,
+    htype=None,
 )
 def profile_overflow(hist):
     if hasattr(hist, "overflow"):
@@ -82,6 +90,7 @@ def profile_overflow(hist):
     key="underflow",
     description="Number of values smaller than the minimum bin-edge of the histogram.",
     dim=1,
+    htype=None,
 )
 def profile_underflow(hist):
     if hasattr(hist, "underflow"):
@@ -93,6 +102,7 @@ def profile_underflow(hist):
     key="phik",
     description="phi-k correlation between the two variables of the histogram",
     dim=2,
+    htype=None,
 )
 def profile_phik(hist):
     from phik import phik
@@ -114,7 +124,7 @@ def profile_phik(hist):
 
 
 @Profiles.register(
-    key="count", description="Number of entries (non-NaN and NaN)", dim=None
+    key="count", description="Number of entries (non-NaN and NaN)", dim=-1, htype=None
 )
 def profile_count(hist):
     return int(sum_entries(hist))
@@ -137,7 +147,8 @@ def profile_distinct(bin_labels, bin_counts):
     return len(np.unique(bin_labels[bin_counts > 0]))
 
 
-def fraction_of_true(bin_labels, bin_entries):
+@Profiles.register(key="fraction_of_true", description="", dim=1, htype="cat")
+def profile_fraction_of_true(bin_labels, bin_counts):
     """Compute fraction of 'true' labels
 
     :param bin_labels: Array containing numbers whose mean is desired. If `a` is not an
@@ -147,7 +158,7 @@ def fraction_of_true(bin_labels, bin_entries):
     :return: fraction of 'true' labels
     """
     bin_labels = np.array(bin_labels)
-    bin_entries = np.array(bin_entries)
+    bin_entries = np.array(bin_counts)
     assert len(bin_labels) == len(bin_entries)
 
     def replace(bl):
@@ -192,11 +203,6 @@ def replace(bl):
     return (1.0 * sum_true) / sum_entries
 
 
-@Profiles.register(key="fraction_of_true", description="", dim=1, htype="cat")
-def profile_fraction_of_true(bin_labels, bin_counts):
-    return fraction_of_true(bin_labels, bin_counts)
-
-
 @Profiles.register(
     key="most_probable_value", description="Most probable value", dim=1, htype="all"
 )