diff --git a/popmon/analysis/__init__.py b/popmon/analysis/__init__.py index a45ccb40..efe44800 100644 --- a/popmon/analysis/__init__.py +++ b/popmon/analysis/__init__.py @@ -19,5 +19,7 @@ from ..analysis.apply_func import ApplyFunc +from .comparison import Comparisons +from .profiling import Profiles -__all__ = ["ApplyFunc"] +__all__ = ["ApplyFunc", "Comparisons", "Profiles"] diff --git a/popmon/analysis/comparison/__init__.py b/popmon/analysis/comparison/__init__.py index 209fa8e2..388ec3ef 100644 --- a/popmon/analysis/comparison/__init__.py +++ b/popmon/analysis/comparison/__init__.py @@ -16,10 +16,6 @@ # COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER # IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -import popmon.analysis.comparison.comparisons # noqa - from ...analysis.comparison.hist_comparer import ( ExpandingHistComparer, ExpandingNormHistComparer, @@ -28,8 +24,10 @@ RollingHistComparer, RollingNormHistComparer, ) +from .comparisons import Comparisons __all__ = [ + "Comparisons", "ReferenceHistComparer", "RollingHistComparer", "ExpandingHistComparer", diff --git a/popmon/analysis/comparison/comparisons.py b/popmon/analysis/comparison/comparisons.py index 099b850c..510705d2 100644 --- a/popmon/analysis/comparison/comparisons.py +++ b/popmon/analysis/comparison/comparisons.py @@ -21,12 +21,15 @@ import numpy as np from scipy import stats -from popmon.analysis.comparison.comparison_registry import Comparisons +from popmon.base.registry import Registry + +Comparisons = Registry() @Comparisons.register( key="max_prob_diff", description="The largest absolute difference between all bin pairs of two normalized histograms (one histogram in a time slot and one in {ref})", + htype="all", ) def googl_test(bins_1, bins_2): """Google-paper test @@ -47,7 +50,7 @@ def dist(bins): return np.max(np.abs(dist(bins_1) - dist(bins_2))) 
-@Comparisons.register(key="psi", description="Population Stability Index") +@Comparisons.register(key="psi", description="Population Stability Index", htype="all") def population_stability_index(po, qo): epsilon = 10e-6 p = po.copy() @@ -66,7 +69,7 @@ def kullback_leibler_divergence(po, qo): return np.sum(p * np.log(p / q)) -@Comparisons.register(key="jsd", description="Jensen-Shannon Divergence") +@Comparisons.register(key="jsd", description="Jensen-Shannon Divergence", htype="all") def jensen_shannon_divergence(p, q): m = 0.5 * (p + q) return 0.5 * (kullback_leibler_divergence(p, m) + kullback_leibler_divergence(q, m)) @@ -177,6 +180,7 @@ def unknown_labels(hist1, hist2): key="pearson", description="Pearson correlation between each time slot and {ref}", dim=(2,), + htype="all", ) def pearson(p, q, *args): # calculate pearson coefficient @@ -264,6 +268,7 @@ def _not_finite_to_zero(x): "The number of normalized residuals of all bin pairs (one histogram in a time" + " slot and one in {ref}) with absolute value bigger than a given threshold (default: 7).", ], + htype="all", ) def chi2(*args, max_res_bound=7.0): chi2r, chi2_norm, zscore, pvalue, res = uu_chi2(*args) diff --git a/popmon/analysis/comparison/hist_comparer.py b/popmon/analysis/comparison/hist_comparer.py index 750ea2f7..9dc25b31 100644 --- a/popmon/analysis/comparison/hist_comparer.py +++ b/popmon/analysis/comparison/hist_comparer.py @@ -52,7 +52,7 @@ def hist_compare(row, hist_name1="", hist_name2=""): :param str hist_name2: name of histogram two to compare :return: pandas Series with popular comparison metrics. 
""" - from .comparison_registry import Comparisons + from popmon.analysis.comparison import Comparisons x = {key: np.nan for key in Comparisons.get_keys()} @@ -82,36 +82,15 @@ def hist_compare(row, hist_name1="", hist_name2=""): htype = "cat" args = [hist1, hist2] - for key, func in Comparisons.get_comparisons(dim=1, htype=htype).items(): - results = func(*args) - if len(key) == 1: - results = (results,) - for k, v in zip(key, results): - x[k] = v - - for key, func in Comparisons.get_comparisons(dim=1, htype="all").items(): - results = func(*entries_list) - if len(key) == 1: - results = (results,) - for k, v in zip(key, results): - x[k] = v + x.update(Comparisons.run(args, dim=1, htype=htype)) + x.update(Comparisons.run(entries_list, dim=1, htype="all")) else: numpy_ndgrids = get_consistent_numpy_ndgrids([hist1, hist2], dim=hist1.n_dim) entries_list = [entry.flatten() for entry in numpy_ndgrids] - for key, func in Comparisons.get_comparisons(dim=(2,)).items(): - results = func(*entries_list) - if len(key) == 1: - results = (results,) - for k, v in zip(key, results): - x[k] = v - - for key, func in Comparisons.get_comparisons(dim=-1).items(): - results = func(*entries_list) - if len(key) == 1: - results = (results,) - for k, v in zip(key, results): - x[k] = v + x.update(Comparisons.run(entries_list, dim=(2,), htype="all")) + + x.update(Comparisons.run(entries_list, dim=-1, htype="all")) if len(set(x.keys()) - set(Comparisons.get_keys())) > 0: raise ValueError("Could not compute full comparison") diff --git a/popmon/analysis/profiling/__init__.py b/popmon/analysis/profiling/__init__.py index 035ba17c..5229425e 100644 --- a/popmon/analysis/profiling/__init__.py +++ b/popmon/analysis/profiling/__init__.py @@ -16,18 +16,16 @@ # COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER # IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- - -import popmon.analysis.profiling.profiles # noqa - from ...analysis.profiling.hist_profiler import HistProfiler from ...analysis.profiling.pull_calculator import ( ExpandingPullCalculator, ReferencePullCalculator, RollingPullCalculator, ) +from .profiles import Profiles __all__ = [ + "Profiles", "HistProfiler", "RollingPullCalculator", "ReferencePullCalculator", diff --git a/popmon/analysis/profiling/hist_profiler.py b/popmon/analysis/profiling/hist_profiler.py index 10380ae2..9db85b43 100644 --- a/popmon/analysis/profiling/hist_profiler.py +++ b/popmon/analysis/profiling/hist_profiler.py @@ -21,8 +21,6 @@ import numpy as np import pandas as pd -from popmon.analysis.profiling.profile_registry import Profiles - from ...base import Module from ...hist.hist_utils import get_bin_centers, is_numeric, is_timestamp @@ -76,6 +74,8 @@ def __init__( raise NotImplementedError() def _profile_1d_histogram(self, name, hist): + from popmon.analysis import Profiles + # preprocessing value counts and TS is_num = is_numeric(hist) is_ts = is_timestamp(hist) or name in self.var_timestamp @@ -95,29 +95,13 @@ def _profile_1d_histogram(self, name, hist): # calc 1d-histogram statistics profile = {} - for key, func in Profiles.get_profiles(dim=1, htype=otype).items(): - args = [bin_labels, bin_counts] - results = func(*args) - if len(key) == 1: - results = (results,) - for k, v in zip(key, results): - profile[k] = v - - for key, func in Profiles.get_profiles(dim=1, htype="all").items(): - args = [bin_labels, bin_counts] - results = func(*args) - if len(key) == 1: - results = (results,) - for k, v in zip(key, results): - profile[k] = v - - for key, func in Profiles.get_profiles(dim=1, htype=None).items(): - args = [hist] - results = func(*args) - if len(key) == 1: - results = (results,) - for k, v in zip(key, results): - profile[k] = v + args = [bin_labels, bin_counts] + + profile.update(Profiles.run(args, dim=1, htype=otype)) + profile.update(Profiles.run(args, dim=1, htype="all")) + 
+ # htype=None and htype="all" differ in the arguments passed (hist vs bin labels and counts) + profile.update(Profiles.run([hist], dim=1, htype=None)) # postprocessing TS if is_ts: @@ -132,6 +116,8 @@ def _profile_1d_histogram(self, name, hist): return profile def _profile_nd_histogram(self, name, hist, dim): + from popmon.analysis import Profiles + if hist.n_dim < dim: self.logger.warning( f"Histogram {name} has {hist.n_dim} dimensions (<{dim}); cannot profile. Returning empty." @@ -139,18 +125,17 @@ def _profile_nd_histogram(self, name, hist, dim): return {} # calc nd-histogram statistics - profile = {} - for key, func in Profiles.get_profiles(dim=dim).items(): - results = func(hist) - - if len(key) == 1: - results = (results,) - for k, v in zip(key, results): - profile[k] = v + profile = Profiles.run([hist], dim=dim, htype=None) + profile.update(Profiles.run([hist], dim=dim, htype="all")) + profile.update(Profiles.run([hist], dim=dim, htype="num")) + profile.update(Profiles.run([hist], dim=dim, htype="cat")) + profile.update(Profiles.run([hist], dim=-1, htype=None)) return profile def _profile_hist(self, split, hist_name): + from popmon.analysis.profiling import Profiles + if len(split) == 0: self.logger.error(f'Split histograms dict "{hist_name}" is empty. Return.') return [] @@ -162,18 +147,18 @@ def _profile_hist(self, split, hist_name): # these are the profiled quantities we will monitor expected_fields = ( - Profiles.get_keys(dim=dimension, htype=htype) - + Profiles.get_keys(dim=dimension, htype="all") - + Profiles.get_keys(dim=dimension, htype=None) + Profiles.get_keys_by_dim_and_htype(dim=dimension, htype=htype) + Profiles.get_keys_by_dim_and_htype(dim=dimension, htype="all") + Profiles.get_keys_by_dim_and_htype(dim=dimension, htype=None) ) # profiles regardless of dim and htype (e.g. 
count) - expected_fields += Profiles.get_keys(dim=None, htype=None) + expected_fields += Profiles.get_keys_by_dim_and_htype(dim=None, htype=None) # profiles regardless of dim - expected_fields += Profiles.get_keys(dim=-1, htype=htype) - expected_fields += Profiles.get_keys(dim=-1, htype="all") - expected_fields += Profiles.get_keys(dim=-1, htype=None) + expected_fields += Profiles.get_keys_by_dim_and_htype(dim=-1, htype=htype) + expected_fields += Profiles.get_keys_by_dim_and_htype(dim=-1, htype="all") + expected_fields += Profiles.get_keys_by_dim_and_htype(dim=-1, htype=None) expected_fields += [self.index_col, self.hist_col] @@ -193,7 +178,8 @@ def _profile_hist(self, split, hist_name): if sorted(profile.keys()) != sorted(expected_fields): self.logger.error( - f'Could not extract full profile for sub-hist "{hist_name} {index}". Skipping.' + f'Could not extract full profile for sub-hist "{hist_name} {index}". ' + f"Differences: {set(profile.keys()).symmetric_difference(set(expected_fields))}. Skipping." 
) else: profile_list.append(profile) diff --git a/popmon/analysis/profiling/profiles.py b/popmon/analysis/profiling/profiles.py index 334618f5..bc4d48da 100644 --- a/popmon/analysis/profiling/profiles.py +++ b/popmon/analysis/profiling/profiles.py @@ -20,11 +20,14 @@ import numpy as np +from popmon.base.registry import Registry + from ...analysis.hist_numpy import get_2dgrid -from ...analysis.profiling.profile_registry import Profiles from ...hist.hist_utils import sum_entries from ...stats import numpy as pm_np +Profiles = Registry() + @Profiles.register( key=["min", "max", "p01", "p05", "p16", "p50", "p84", "p95", "p99"], @@ -43,8 +46,10 @@ htype="num", ) def profile_quantiles(x, w): - return pm_np.quantile( - x, q=[0.0, 1.0, 0.01, 0.05, 0.16, 0.50, 0.84, 0.95, 0.99], weights=w + return tuple( + pm_np.quantile( + x, q=[0.0, 1.0, 0.01, 0.05, 0.16, 0.50, 0.84, 0.95, 0.99], weights=w + ) ) @@ -58,7 +63,9 @@ def profile_std(x, w): return pm_np.std(x, w) -@Profiles.register(key="nan", description="Number of missing entries (NaN)", dim=1) +@Profiles.register( + key="nan", description="Number of missing entries (NaN)", dim=1, htype=None +) def profile_nan(hist): if hasattr(hist, "nanflow"): return hist.nanflow.entries @@ -71,6 +78,7 @@ def profile_nan(hist): key="overflow", description="Number of values larger than the maximum bin-edge of the histogram.", dim=1, + htype=None, ) def profile_overflow(hist): if hasattr(hist, "overflow"): @@ -82,6 +90,7 @@ def profile_overflow(hist): key="underflow", description="Number of values smaller than the minimum bin-edge of the histogram.", dim=1, + htype=None, ) def profile_underflow(hist): if hasattr(hist, "underflow"): @@ -93,6 +102,7 @@ def profile_underflow(hist): key="phik", description="phi-k correlation between the two variables of the histogram", dim=2, + htype=None, ) def profile_phik(hist): from phik import phik @@ -114,7 +124,7 @@ def profile_phik(hist): @Profiles.register( - key="count", description="Number of entries 
(non-NaN and NaN)", dim=None + key="count", description="Number of entries (non-NaN and NaN)", dim=-1, htype=None ) def profile_count(hist): return int(sum_entries(hist)) @@ -137,7 +147,8 @@ def profile_distinct(bin_labels, bin_counts): return len(np.unique(bin_labels[bin_counts > 0])) -def fraction_of_true(bin_labels, bin_entries): +@Profiles.register(key="fraction_of_true", description="", dim=1, htype="cat") +def profile_fraction_of_true(bin_labels, bin_counts): """Compute fraction of 'true' labels :param bin_labels: Array containing numbers whose mean is desired. If `a` is not an @@ -147,7 +158,7 @@ def fraction_of_true(bin_labels, bin_entries): :return: fraction of 'true' labels """ bin_labels = np.array(bin_labels) - bin_entries = np.array(bin_entries) + bin_entries = np.array(bin_counts) assert len(bin_labels) == len(bin_entries) def replace(bl): @@ -192,11 +203,6 @@ def replace(bl): return (1.0 * sum_true) / sum_entries -@Profiles.register(key="fraction_of_true", description="", dim=1, htype="cat") -def profile_fraction_of_true(bin_labels, bin_counts): - return fraction_of_true(bin_labels, bin_counts) - - @Profiles.register( key="most_probable_value", description="Most probable value", dim=1, htype="all" ) diff --git a/popmon/base/registry.py b/popmon/base/registry.py new file mode 100644 index 00000000..1bc2b7db --- /dev/null +++ b/popmon/base/registry.py @@ -0,0 +1,127 @@ +# Copyright (c) 2022 ING Wholesale Banking Advanced Analytics +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included 
in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +from collections import defaultdict +from typing import Callable, Dict, List, Optional, Tuple, Union + + +class Registry: + _properties = ("dim", "htype") + + def __init__(self): + self._keys: List[str] = [] + self._descriptions: Dict[str, str] = {} + self._properties_to_func = defaultdict(lambda: defaultdict(dict)) + self._func_name_to_properties = {} + + def register( + self, + key: Union[str, List[str], Tuple[str]], + description: Union[str, List[str], Tuple[str]], + dim: int = -1, + htype: Optional[str] = None, + ): + # rename for function use, without changing api + keys = key + del key + + descriptions = description + del description + + # ensure that keys are a tuple + if isinstance(keys, list): + keys = tuple(keys) + elif not isinstance(keys, tuple): + keys = (keys,) + + # ensure that description is a tuple + if isinstance(descriptions, list): + descriptions = tuple(descriptions) + elif not isinstance(descriptions, tuple): + descriptions = (descriptions,) + + def f(func: Callable): + # function names should be unique + if func.__name__ in self._func_name_to_properties: + raise ValueError( + f"A function with the name '{func.__name__}' has already been registered." 
+ ) + + # keys should uniquely correspond to a function + for key in keys: + if key in self._keys: + raise ValueError(f"Key '{key}' has already been registered.") + + # register properties + self._keys += list(keys) + self._func_name_to_properties[func.__name__] = (dim, htype, keys) + self._properties_to_func[dim][htype][keys] = func + self._descriptions.update(dict(zip(keys, descriptions))) + + return func + + return f + + # Methods + def _get_func_properties_by_name( + self, function_name: str + ) -> Tuple[int, str, Tuple[str]]: + return self._func_name_to_properties[function_name] + + def get_func_by_name(self, function_name: str) -> Callable: + """ + Get a function by the function name + + Parameters + ---------- + function_name: name of the original function + """ + dim, htype, key = self._get_func_properties_by_name(function_name) + return self._properties_to_func[dim][htype][key] + + def get_func_by_dim_and_htype(self, dim, htype) -> Dict[Tuple[str], Callable]: + return self._properties_to_func[dim][htype] + + def get_keys(self) -> List[str]: + """List of keys associated with registered functions""" + return self._keys + + def get_keys_by_dim_and_htype(self, dim, htype) -> List[str]: + """Flat list of keys for a provided dimension and histogram type""" + return [ + v for values in self._properties_to_func[dim][htype].keys() for v in values + ] + + def get_descriptions(self) -> Dict[str, str]: + """Dictionary of key->description associated with registered functions""" + return self._descriptions + + def update_func(self, name, func) -> None: + dim, htype, key = self._func_name_to_properties[name] + self._properties_to_func[dim][htype][key] = func + + def run(self, args, dim, htype): + output = {} + for key, func in self.get_func_by_dim_and_htype(dim=dim, htype=htype).items(): + results = func(*args) + if not isinstance(results, tuple): + results = (results,) + output.update(dict(zip(key, results))) + return output diff --git 
a/popmon/pipeline/metrics_pipelines.py b/popmon/pipeline/metrics_pipelines.py index f3c8ef31..601c5eb1 100644 --- a/popmon/pipeline/metrics_pipelines.py +++ b/popmon/pipeline/metrics_pipelines.py @@ -224,7 +224,7 @@ def __init__( :param list features: features of histograms to pick up from input data (optional) :return: assembled self reference pipeline """ - from popmon.analysis.comparison.comparison_registry import Comparisons + from popmon.analysis.comparison import Comparisons reference_prefix = "ref" reference_modules: List[Union[Module, Pipeline]] = [ @@ -284,7 +284,7 @@ def __init__( :param list features: features of histograms to pick up from input data (optional) :return: assembled external reference pipeline """ - from popmon.analysis.comparison.comparison_registry import Comparisons + from popmon.analysis.comparison import Comparisons reference_prefix = "ref" reference_modules: List[Union[Module, Pipeline]] = [ @@ -347,7 +347,7 @@ def __init__( :param list features: features of histograms to pick up from input data (optional) :return: assembled rolling reference pipeline """ - from popmon.analysis.comparison.comparison_registry import Comparisons + from popmon.analysis.comparison import Comparisons reference_prefix = "roll" reference_modules: List[Union[Module, Pipeline]] = [ @@ -408,7 +408,7 @@ def __init__( :param list features: features of histograms to pick up from input data (optional) :return: assembled expanding reference pipeline """ - from popmon.analysis.comparison.comparison_registry import Comparisons + from popmon.analysis.comparison import Comparisons reference_prefix = "expanding" reference_modules: List[Union[Module, Pipeline]] = [ diff --git a/popmon/visualization/section_generator.py b/popmon/visualization/section_generator.py index 437acc2b..120523cb 100644 --- a/popmon/visualization/section_generator.py +++ b/popmon/visualization/section_generator.py @@ -24,8 +24,8 @@ import pandas as pd from tqdm import tqdm -from 
popmon.analysis.comparison.comparison_registry import Comparisons -from popmon.analysis.profiling.profile_registry import Profiles +from popmon.analysis.comparison import Comparisons +from popmon.analysis.profiling import Profiles from ..base import Module from ..config import Report