Skip to content

Commit

Permalink
feat(registry): generalize registry
Browse files Browse the repository at this point in the history
Generalized the registry pattern from profiles
and comparisons to a single class
  • Loading branch information
sbrugman committed Jul 4, 2022
1 parent 78bae81 commit d01c68a
Show file tree
Hide file tree
Showing 10 changed files with 199 additions and 98 deletions.
4 changes: 3 additions & 1 deletion popmon/analysis/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,5 +19,7 @@


from ..analysis.apply_func import ApplyFunc
from .comparison import Comparisons
from .profiling import Profiles

__all__ = ["ApplyFunc"]
__all__ = ["ApplyFunc", "Comparisons", "Profiles"]
6 changes: 2 additions & 4 deletions popmon/analysis/comparison/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,6 @@
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


import popmon.analysis.comparison.comparisons # noqa

from ...analysis.comparison.hist_comparer import (
ExpandingHistComparer,
ExpandingNormHistComparer,
Expand All @@ -28,8 +24,10 @@
RollingHistComparer,
RollingNormHistComparer,
)
from .comparisons import Comparisons

__all__ = [
"Comparisons",
"ReferenceHistComparer",
"RollingHistComparer",
"ExpandingHistComparer",
Expand Down
11 changes: 8 additions & 3 deletions popmon/analysis/comparison/comparisons.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,15 @@
import numpy as np
from scipy import stats

from popmon.analysis.comparison.comparison_registry import Comparisons
from popmon.base.registry import Registry

Comparisons = Registry()


@Comparisons.register(
key="max_prob_diff",
description="The largest absolute difference between all bin pairs of two normalized histograms (one histogram in a time slot and one in {ref})",
htype="all",
)
def googl_test(bins_1, bins_2):
"""Google-paper test
Expand All @@ -47,7 +50,7 @@ def dist(bins):
return np.max(np.abs(dist(bins_1) - dist(bins_2)))


@Comparisons.register(key="psi", description="Population Stability Index")
@Comparisons.register(key="psi", description="Population Stability Index", htype="all")
def population_stability_index(po, qo):
epsilon = 10e-6
p = po.copy()
Expand All @@ -66,7 +69,7 @@ def kullback_leibler_divergence(po, qo):
return np.sum(p * np.log(p / q))


@Comparisons.register(key="jsd", description="Jensen-Shannon Divergence")
@Comparisons.register(key="jsd", description="Jensen-Shannon Divergence", htype="all")
def jensen_shannon_divergence(p, q):
m = 0.5 * (p + q)
return 0.5 * (kullback_leibler_divergence(p, m) + kullback_leibler_divergence(q, m))
Expand Down Expand Up @@ -177,6 +180,7 @@ def unknown_labels(hist1, hist2):
key="pearson",
description="Pearson correlation between each time slot and {ref}",
dim=(2,),
htype="all",
)
def pearson(p, q, *args):
# calculate pearson coefficient
Expand Down Expand Up @@ -264,6 +268,7 @@ def _not_finite_to_zero(x):
"The number of normalized residuals of all bin pairs (one histogram in a time"
+ " slot and one in {ref}) with absolute value bigger than a given threshold (default: 7).",
],
htype="all",
)
def chi2(*args, max_res_bound=7.0):
chi2r, chi2_norm, zscore, pvalue, res = uu_chi2(*args)
Expand Down
33 changes: 6 additions & 27 deletions popmon/analysis/comparison/hist_comparer.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def hist_compare(row, hist_name1="", hist_name2=""):
:param str hist_name2: name of histogram two to compare
:return: pandas Series with popular comparison metrics.
"""
from .comparison_registry import Comparisons
from popmon.analysis.comparison import Comparisons

x = {key: np.nan for key in Comparisons.get_keys()}

Expand Down Expand Up @@ -82,36 +82,15 @@ def hist_compare(row, hist_name1="", hist_name2=""):
htype = "cat"
args = [hist1, hist2]

for key, func in Comparisons.get_comparisons(dim=1, htype=htype).items():
results = func(*args)
if len(key) == 1:
results = (results,)
for k, v in zip(key, results):
x[k] = v

for key, func in Comparisons.get_comparisons(dim=1, htype="all").items():
results = func(*entries_list)
if len(key) == 1:
results = (results,)
for k, v in zip(key, results):
x[k] = v
x.update(Comparisons.run(args, dim=1, htype=htype))
x.update(Comparisons.run(entries_list, dim=1, htype="all"))
else:
numpy_ndgrids = get_consistent_numpy_ndgrids([hist1, hist2], dim=hist1.n_dim)
entries_list = [entry.flatten() for entry in numpy_ndgrids]

for key, func in Comparisons.get_comparisons(dim=(2,)).items():
results = func(*entries_list)
if len(key) == 1:
results = (results,)
for k, v in zip(key, results):
x[k] = v

for key, func in Comparisons.get_comparisons(dim=-1).items():
results = func(*entries_list)
if len(key) == 1:
results = (results,)
for k, v in zip(key, results):
x[k] = v
x.update(Comparisons.run(entries_list, dim=(2,), htype="all"))

x.update(Comparisons.run(entries_list, dim=-1, htype="all"))

if len(set(x.keys()) - set(Comparisons.get_keys())) > 0:
raise ValueError("Could not compute full comparison")
Expand Down
6 changes: 2 additions & 4 deletions popmon/analysis/profiling/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,18 +16,16 @@
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


import popmon.analysis.profiling.profiles # noqa

from ...analysis.profiling.hist_profiler import HistProfiler
from ...analysis.profiling.pull_calculator import (
ExpandingPullCalculator,
ReferencePullCalculator,
RollingPullCalculator,
)
from .profiles import Profiles

__all__ = [
"Profiles",
"HistProfiler",
"RollingPullCalculator",
"ReferencePullCalculator",
Expand Down
68 changes: 27 additions & 41 deletions popmon/analysis/profiling/hist_profiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,6 @@
import numpy as np
import pandas as pd

from popmon.analysis.profiling.profile_registry import Profiles

from ...base import Module
from ...hist.hist_utils import get_bin_centers, is_numeric, is_timestamp

Expand Down Expand Up @@ -76,6 +74,8 @@ def __init__(
raise NotImplementedError()

def _profile_1d_histogram(self, name, hist):
from popmon.analysis import Profiles

# preprocessing value counts and TS
is_num = is_numeric(hist)
is_ts = is_timestamp(hist) or name in self.var_timestamp
Expand All @@ -95,29 +95,13 @@ def _profile_1d_histogram(self, name, hist):

# calc 1d-histogram statistics
profile = {}
for key, func in Profiles.get_profiles(dim=1, htype=otype).items():
args = [bin_labels, bin_counts]
results = func(*args)
if len(key) == 1:
results = (results,)
for k, v in zip(key, results):
profile[k] = v

for key, func in Profiles.get_profiles(dim=1, htype="all").items():
args = [bin_labels, bin_counts]
results = func(*args)
if len(key) == 1:
results = (results,)
for k, v in zip(key, results):
profile[k] = v

for key, func in Profiles.get_profiles(dim=1, htype=None).items():
args = [hist]
results = func(*args)
if len(key) == 1:
results = (results,)
for k, v in zip(key, results):
profile[k] = v
args = [bin_labels, bin_counts]

profile.update(Profiles.run(args, dim=1, htype=otype))
profile.update(Profiles.run(args, dim=1, htype="all"))

# htype=None and htype="all" differ in the arguments passed: "all" profiles receive (bin_labels, bin_counts), htype=None profiles receive the hist object itself
profile.update(Profiles.run([hist], dim=1, htype=None))

# postprocessing TS
if is_ts:
Expand All @@ -132,25 +116,26 @@ def _profile_1d_histogram(self, name, hist):
return profile

def _profile_nd_histogram(self, name, hist, dim):
from popmon.analysis import Profiles

if hist.n_dim < dim:
self.logger.warning(
f"Histogram {name} has {hist.n_dim} dimensions (<{dim}); cannot profile. Returning empty."
)
return {}

# calc nd-histogram statistics
profile = {}
for key, func in Profiles.get_profiles(dim=dim).items():
results = func(hist)

if len(key) == 1:
results = (results,)
for k, v in zip(key, results):
profile[k] = v
profile = Profiles.run([hist], dim=dim, htype=None)
profile.update(Profiles.run([hist], dim=dim, htype="all"))
profile.update(Profiles.run([hist], dim=dim, htype="num"))
profile.update(Profiles.run([hist], dim=dim, htype="cat"))

profile.update(Profiles.run([hist], dim=-1, htype=None))
return profile

def _profile_hist(self, split, hist_name):
from popmon.analysis.profiling import Profiles

if len(split) == 0:
self.logger.error(f'Split histograms dict "{hist_name}" is empty. Return.')
return []
Expand All @@ -162,18 +147,18 @@ def _profile_hist(self, split, hist_name):

# these are the profiled quantities we will monitor
expected_fields = (
Profiles.get_keys(dim=dimension, htype=htype)
+ Profiles.get_keys(dim=dimension, htype="all")
+ Profiles.get_keys(dim=dimension, htype=None)
Profiles.get_keys_by_dim_and_htype(dim=dimension, htype=htype)
+ Profiles.get_keys_by_dim_and_htype(dim=dimension, htype="all")
+ Profiles.get_keys_by_dim_and_htype(dim=dimension, htype=None)
)

# profiles regardless of dim and htype (e.g. count)
expected_fields += Profiles.get_keys(dim=None, htype=None)
expected_fields += Profiles.get_keys_by_dim_and_htype(dim=None, htype=None)

# profiles regardless of dim
expected_fields += Profiles.get_keys(dim=-1, htype=htype)
expected_fields += Profiles.get_keys(dim=-1, htype="all")
expected_fields += Profiles.get_keys(dim=-1, htype=None)
expected_fields += Profiles.get_keys_by_dim_and_htype(dim=-1, htype=htype)
expected_fields += Profiles.get_keys_by_dim_and_htype(dim=-1, htype="all")
expected_fields += Profiles.get_keys_by_dim_and_htype(dim=-1, htype=None)

expected_fields += [self.index_col, self.hist_col]

Expand All @@ -193,7 +178,8 @@ def _profile_hist(self, split, hist_name):

if sorted(profile.keys()) != sorted(expected_fields):
self.logger.error(
f'Could not extract full profile for sub-hist "{hist_name} {index}". Skipping.'
f'Could not extract full profile for sub-hist "{hist_name} {index}".'
f"Differences: {set(profile.keys()).symmetric_difference(set(expected_fields))}. Skipping."
)
else:
profile_list.append(profile)
Expand Down
30 changes: 18 additions & 12 deletions popmon/analysis/profiling/profiles.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,14 @@

import numpy as np

from popmon.base.registry import Registry

from ...analysis.hist_numpy import get_2dgrid
from ...analysis.profiling.profile_registry import Profiles
from ...hist.hist_utils import sum_entries
from ...stats import numpy as pm_np

Profiles = Registry()


@Profiles.register(
key=["min", "max", "p01", "p05", "p16", "p50", "p84", "p95", "p99"],
Expand All @@ -43,8 +46,10 @@
htype="num",
)
def profile_quantiles(x, w):
return pm_np.quantile(
x, q=[0.0, 1.0, 0.01, 0.05, 0.16, 0.50, 0.84, 0.95, 0.99], weights=w
return tuple(
pm_np.quantile(
x, q=[0.0, 1.0, 0.01, 0.05, 0.16, 0.50, 0.84, 0.95, 0.99], weights=w
)
)


Expand All @@ -58,7 +63,9 @@ def profile_std(x, w):
return pm_np.std(x, w)


@Profiles.register(key="nan", description="Number of missing entries (NaN)", dim=1)
@Profiles.register(
key="nan", description="Number of missing entries (NaN)", dim=1, htype=None
)
def profile_nan(hist):
if hasattr(hist, "nanflow"):
return hist.nanflow.entries
Expand All @@ -71,6 +78,7 @@ def profile_nan(hist):
key="overflow",
description="Number of values larger than the maximum bin-edge of the histogram.",
dim=1,
htype=None,
)
def profile_overflow(hist):
if hasattr(hist, "overflow"):
Expand All @@ -82,6 +90,7 @@ def profile_overflow(hist):
key="underflow",
description="Number of values smaller than the minimum bin-edge of the histogram.",
dim=1,
htype=None,
)
def profile_underflow(hist):
if hasattr(hist, "underflow"):
Expand All @@ -93,6 +102,7 @@ def profile_underflow(hist):
key="phik",
description="phi-k correlation between the two variables of the histogram",
dim=2,
htype=None,
)
def profile_phik(hist):
from phik import phik
Expand All @@ -114,7 +124,7 @@ def profile_phik(hist):


@Profiles.register(
key="count", description="Number of entries (non-NaN and NaN)", dim=None
key="count", description="Number of entries (non-NaN and NaN)", dim=-1, htype=None
)
def profile_count(hist):
return int(sum_entries(hist))
Expand All @@ -137,7 +147,8 @@ def profile_distinct(bin_labels, bin_counts):
return len(np.unique(bin_labels[bin_counts > 0]))


def fraction_of_true(bin_labels, bin_entries):
@Profiles.register(key="fraction_of_true", description="", dim=1, htype="cat")
def profile_fraction_of_true(bin_labels, bin_counts):
"""Compute fraction of 'true' labels
:param bin_labels: Array containing numbers whose mean is desired. If `a` is not an
Expand All @@ -147,7 +158,7 @@ def fraction_of_true(bin_labels, bin_entries):
:return: fraction of 'true' labels
"""
bin_labels = np.array(bin_labels)
bin_entries = np.array(bin_entries)
bin_entries = np.array(bin_counts)
assert len(bin_labels) == len(bin_entries)

def replace(bl):
Expand Down Expand Up @@ -192,11 +203,6 @@ def replace(bl):
return (1.0 * sum_true) / sum_entries


@Profiles.register(key="fraction_of_true", description="", dim=1, htype="cat")
def profile_fraction_of_true(bin_labels, bin_counts):
return fraction_of_true(bin_labels, bin_counts)


@Profiles.register(
key="most_probable_value", description="Most probable value", dim=1, htype="all"
)
Expand Down
Loading

0 comments on commit d01c68a

Please sign in to comment.