fix: protection against outliers in sparse histograms (#215)

- Performance speedup in histogram summation o Replace for loop with np.sum - Protection against outliers in sparse histograms o Protect against extreme outliers in sparse histograms by avoiding the use of its dense representation where possible. - Working analysis function unit tests o update of test_report_generator o section_generator now passes np arrays to plot_bars_b64, no longer pd.Series. - Added crop_range option to get_consistent_numpy_1dhists function o return a cropped range version of the histogram, between 5-95% quantiles. o Plot cropped histograms in histogram section of report. - Bump up versions of histogrammar
ing-bank · Jun 5, 2022 · dc37a00 · dc37a00
1 parent 9757ea7
commit dc37a00
Show file tree

Hide file tree

Showing 11 changed files with 86 additions and 164 deletions.
diff --git a/popmon/analysis/comparison/hist_comparer.py b/popmon/analysis/comparison/hist_comparer.py
@@ -34,7 +34,6 @@
 )
 from ...analysis.hist_numpy import (
     check_similar_hists,
-    get_consistent_numpy_1dhists,
     get_consistent_numpy_entries,
     get_consistent_numpy_ndgrids,
 )
@@ -95,9 +94,8 @@ def hist_compare(row, hist_name1="", hist_name2="", max_res_bound=7.0):
     # compare
     if hist1.n_dim == 1:
         if is_numeric(hist1):
-            numpy_1dhists = get_consistent_numpy_1dhists([hist1, hist2])
-            entries_list = [nphist[0] for nphist in numpy_1dhists]
             # KS-test only properly defined for (ordered) 1D interval variables
+            entries_list = get_consistent_numpy_entries([hist1, hist2])
             ks_testscore = ks_test(*entries_list)
             x["ks"] = ks_testscore
             ks_pvalue = ks_prob(ks_testscore)

diff --git a/popmon/analysis/functions.py b/popmon/analysis/functions.py
@@ -343,12 +343,7 @@ def hist_sum(x, hist_name=""):
     if not similar:
         return pd.Series(o)
 
-    # MB FIX: h_sum not initialized correctly in a sum by histogrammar for sparselybin (origin); below it is.
-    # h_sum = np.sum([hist for hist in hist_list])
-
-    h_sum = hist_list[0].zero()
-    for hist in hist_list:
-        h_sum += hist
+    h_sum = np.sum(hist_list)
     o[hist_name] = h_sum
     return pd.Series(o)
 

diff --git a/popmon/analysis/hist_numpy.py b/popmon/analysis/hist_numpy.py
@@ -24,7 +24,8 @@
 import numpy as np
 from histogrammar.util import get_hist_props
 
-from ..hist.hist_utils import is_numeric
+from ..hist.hist_utils import get_bin_centers, is_numeric
+from ..stats.numpy import quantile
 
 used_hist_types = (histogrammar.Bin, histogrammar.SparselyBin, histogrammar.Categorize)
 
@@ -209,12 +210,19 @@ def get_consistent_numpy_2dgrids(hist_list=[], get_bin_labels=False):
     return get_consistent_numpy_ndgrids(hist_list, get_bin_labels, dim=2)
 
 
-def get_consistent_numpy_1dhists(hist_list, get_bin_labels=False):
-    """Get list of consistent numpy hists for list of sparse input histograms
+def get_consistent_numpy_1dhists(hist_list, get_bin_labels=False, crop_range=False):
+    """Get list of consistent numpy hists for list of sparse (or bin) input histograms
+
+    Works for sparse and bin histograms.
+    Note: for sparse histograms, all potential bins between low and high are picked up (also unfilled).
 
     Note: a numpy histogram is a union of lists of bin_edges and number of entries
+    This gives the full range of bin_centers, including zeros, which is not robust against (extreme) outliers.
+    Ideally, use this for plotting of multiple histograms only.
 
     :param list hist_list: list of input histogram objects
+    :param bool get_bin_labels: return bin labels as well, default is false.
+    :param bool crop_range: return a trimmed version of the histogram, between 5-95% quantiles.
     :return: list of consistent 1d numpy hists for list of sparse input histograms
     """
     # --- basic checks
@@ -229,8 +237,36 @@ def get_consistent_numpy_1dhists(hist_list, get_bin_labels=False):
     high = max(high_arr) if len(high_arr) > 0 else None
     # low == None and/or high == None can only happen when all input hists are empty.
 
+    if crop_range:
+        # crop_range option crops a histogram to reasonable range, e.g. for plotting, giving nice plots.
+        # in particular this protects against outliers that distort the view on the core part of the distribution
+        # range is quantiles 5-95% + 5% on both sides
+        q05_arr = []
+        q95_arr = []
+        for hist in hist_list:
+            bin_centers, values = get_bin_centers(hist)
+            bin_entries = np.array([v.entries for v in values])
+            qs = quantile(bin_centers, [0.05, 0.95], bin_entries)
+            q05_arr.append(qs[0])
+            q95_arr.append(qs[1])
+        q05 = min(q05_arr) if len(q05_arr) > 0 else np.nan
+        q95 = max(q95_arr) if len(q95_arr) > 0 else np.nan
+        delta = q95 - q05
+        var_min = q05 - (0.06 / 0.9) * delta
+        var_max = q95 + (0.06 / 0.9) * delta
+        if 0.0 < var_min < 0.2 * delta:
+            var_min = 0.0
+        elif -0.2 * delta < var_max < 0.0:
+            var_max = 0.0
+        if not np.isnan(var_min) and low is not None and var_min > low:
+            low = var_min
+        if not np.isnan(var_max) and high is not None and var_max < high:
+            high = var_max
+
     # if one of the input histograms is sparse and empty, copy the bin-edges and bin-centers
     # from a filled histogram, and use empty bin-entries array
+    # MB 20220601: note this gives the full range of bin_centers, which is not robust against (extreme) outliers
+    #              get_consistent_numpy_entries() ignores all empty bins.
     bin_edges = [0.0, 1.0]
     bin_centers = [0.5]
     null_entries = [0.0]
@@ -239,7 +275,7 @@ def get_consistent_numpy_1dhists(hist_list, get_bin_labels=False):
             if hist.low is not None and hist.high is not None:
                 bin_edges = hist.bin_edges(low, high)
                 bin_centers = hist.bin_centers(low, high)
-                null_entries = [0] * len(bin_centers)
+                null_entries = np.zeros(len(bin_centers))
                 break
 
     nphist_list = []
@@ -260,6 +296,10 @@ def get_consistent_numpy_1dhists(hist_list, get_bin_labels=False):
 def get_consistent_numpy_entries(hist_list, get_bin_labels=False):
     """Get list of consistent numpy bin_entries for list of 1d input histograms
 
+    Works for categorize, sparse and bin histograms.
+    Note: for sparse histograms, *only* the filled bins are picked up.
+    (this is not the case when calling get_consistent_numpy_1dhists(), which takes all bins b/n low and high.)
+
     :param list hist_list: list of input histogrammar histograms
     :return: list of consistent 1d numpy arrays with bin_entries for list of input histograms
     """
@@ -281,7 +321,7 @@ def get_consistent_numpy_entries(hist_list, get_bin_labels=False):
     # union of all labels encountered
     labels = set()
     for hist in hist_list:
-        bin_labels = hist.bin_centers() if all_num else hist.keySet
+        bin_labels = get_bin_centers(hist)[0]
         labels = labels.union(bin_labels)
     labels = sorted(labels)
 
@@ -294,7 +334,6 @@ def get_consistent_numpy_entries(hist_list, get_bin_labels=False):
         props = get_hist_props(hist_list[0])
         if props["is_bool"]:
             cat_labels = [lab == "True" for lab in cat_labels]
-
         kwargs = {"labels": cat_labels}
 
     entries_list = [hist.bin_entries(**kwargs) for hist in hist_list]

diff --git a/popmon/analysis/profiling/hist_profiler.py b/popmon/analysis/profiling/hist_profiler.py
@@ -207,8 +207,8 @@ def _profile_1d_histogram(self, name, hist):
         is_num = is_numeric(hist)
         is_ts = is_timestamp(hist) or name in self.var_timestamp
 
-        bin_labels = np.array(get_bin_centers(hist)[0])
-        bin_counts = np.array([v.entries for v in get_bin_centers(hist)[1]])
+        bin_labels, values = get_bin_centers(hist)
+        bin_counts = np.array([v.entries for v in values])
 
         if len(bin_counts) == 0:
             self.logger.warning(f'Histogram "{name}" is empty; skipping.')

diff --git a/popmon/hist/hist_utils.py b/popmon/hist/hist_utils.py
@@ -220,13 +220,15 @@ def is_numeric(hist):
 
 def sparse_bin_centers_x(hist):
     """Get x-axis bin centers of sparse histogram"""
-    keys = sorted(hist.bins.keys())
+    # note: want sorted keys for plotting
+    keys = np.array(sorted(hist.bins.keys()))
     if hist.minBin is None or hist.maxBin is None:
         # number of bins is set to 1.
         centers = np.array([hist.origin + 0.5 * hist.binWidth])
     else:
-        centers = np.array([hist.origin + (i + 0.5) * hist.binWidth for i in keys])
-
+        # default for filled histogram
+        centers = hist.origin + (keys + 0.5) * hist.binWidth
+    # note: so values is also sorted
     values = [hist.bins[key] for key in keys]
     return centers, values
 

diff --git a/popmon/visualization/histogram_section.py b/popmon/visualization/histogram_section.py
@@ -167,13 +167,14 @@ def transform(self, data_obj: dict, sections: Optional[list] = None):
         return sections
 
 
-def _plot_histograms(feature, date, hc_list, hist_names, top_n):
+def _plot_histograms(feature, date, hc_list, hist_names, top_n, max_nbins=1000):
     """Split off plot histogram generation to allow for parallel processing
 
     :param str feature: feature
     :param str date: date of time slot
     :param list hc_list: histogram list
     :param list hist_names: names of histograms to show as labels
+    :param int max_nbins: maximum number of histogram bins allowed for plot (default 1000)
     :return: dict with plotted histogram
     """
     # basic checks
@@ -187,18 +188,22 @@ def _plot_histograms(feature, date, hc_list, hist_names, top_n):
     hist_names = [hn for i, hn in enumerate(hist_names) if i not in none_hists]
     # more basic checks
     if len(hc_list) == 0:
-        return date, ""
+        return {"name": date, "description": get_stat_description(date), "plot": ""}
     assert_similar_hists(hc_list)
 
     # make plot. note: slow!
     if hc_list[0].n_dim == 1:
+        if all(h.size == 0 for h in hc_list):
+            # triviality checks, skip all histograms empty
+            return {"name": date, "description": get_stat_description(date), "plot": ""}
+
         props = get_hist_props(hc_list[0])
         is_num = props["is_num"]
         is_ts = props["is_ts"]
         y_label = "Bin count" if len(hc_list) == 1 else "Bin probability"
 
         if is_num:
-            numpy_1dhists = get_consistent_numpy_1dhists(hc_list)
+            numpy_1dhists = get_consistent_numpy_1dhists(hc_list, crop_range=True)
             entries_list = [nphist[0] for nphist in numpy_1dhists]
             bins = numpy_1dhists[0][1]  # bins = bin-edges
         else:
@@ -207,9 +212,9 @@ def _plot_histograms(feature, date, hc_list, hist_names, top_n):
                 hc_list, get_bin_labels=True
             )  # bins = bin-labels
 
-        if len(bins) == 0:
-            # skip empty histograms
-            return date, ""
+        # skip histograms with too many bins to plot (default more than 1000)
+        if len(bins) > max_nbins:
+            return {"name": date, "description": get_stat_description(date), "plot": ""}
 
         # normalize histograms for plotting (comparison!) in case there is more than one.
         if len(hc_list) >= 2:

diff --git a/popmon/visualization/section_generator.py b/popmon/visualization/section_generator.py
@@ -132,7 +132,7 @@ def transform(
                 inplace=True,
                 errors="ignore",
             )
-            dates = [short_date(str(date)) for date in df.index.tolist()]
+            dates = np.array([short_date(str(date)) for date in df.index.tolist()])
 
             metrics = filter_metrics(
                 df.columns, self.ignore_stat_endswith, self.show_stats
@@ -143,7 +143,7 @@ def transform(
                     feature,
                     metric,
                     dates,
-                    df[metric],
+                    df[metric].values,
                     static_bounds,
                     fdbounds,
                     self.prefix,
@@ -201,13 +201,14 @@ def _plot_metric(
     )
     # choose dynamic bounds if present
     bounds = dbounds if len(dbounds) > 0 else sbounds
+
     # prune dates and values
     dates = _prune(dates, last_n, skip_first_n, skip_last_n)
     values = _prune(values, last_n, skip_first_n, skip_last_n)
 
     # make plot. note: slow!
     plot = plot_bars_b64(
-        data=np.array(values),
+        data=values,
         labels=dates,
         ylim=True,
         bounds=bounds,

diff --git a/popmon/visualization/utils.py b/popmon/visualization/utils.py
@@ -78,7 +78,7 @@ def plot_bars_b64(data, labels=None, bounds=None, ylim=False, skip_empty=True):
     """
     # basic checks first
     n = data.size  # number of bins
-    if labels and len(labels) != n:
+    if labels is not None and len(labels) != n:
         raise ValueError("shape mismatch: x-axis labels do not match the data shape")
 
     # skip plot generation for empty datasets
@@ -97,7 +97,7 @@ def plot_bars_b64(data, labels=None, bounds=None, ylim=False, skip_empty=True):
     width = (index[1] - index[0]) * 0.9 if n >= 2 else 1.0
     ax.bar(index, data, width=width, align="center")
 
-    if labels:
+    if labels is not None:
         ax.set_xticks(index)
         ax.set_xticklabels(labels, fontdict={"rotation": "vertical"})
         granularity = math.ceil(len(labels) / 50)

diff --git a/requirements.txt b/requirements.txt
@@ -1,7 +1,7 @@
 numpy>=1.18.0
 pandas>=0.25.1
 scipy>=1.5.2
-histogrammar>=1.0.27
+histogrammar>=1.0.28
 phik
 jinja2
 tqdm

diff --git a/setup.py b/setup.py
@@ -1,6 +1,6 @@
 from setuptools import find_packages, setup
 
-__version__ = "0.6.1"
+__version__ = "0.6.2"
 
 with open("requirements.txt") as f:
     REQUIREMENTS = f.read().splitlines()