From dc37a007a2f3925bf92ee8bf31a54aaf28fb11e1 Mon Sep 17 00:00:00 2001 From: Max Baak Date: Sun, 5 Jun 2022 22:18:30 +0200 Subject: [PATCH] fix: protection against outliers in sparse histograms (#215) - Performance speedup in histogram summation o Replace for loop with np.sum - Protection against outliers in sparse histograms o Protect against extreme outliers in sparse histograms by avoiding the use of its dense representation where possible. - Working analysis function unit tests o update of test_report_generator o section_generator now passes np arrays to plot_bars_b64, no longer pd.Series. - Added crop_range option to get_consistent_numpy_1dhists function o return a cropped range version of the histogram, between 5-95% quantiles. o Plot cropped histograms in histogram section of report. - Bump up versions of histogrammar --- popmon/analysis/comparison/hist_comparer.py | 4 +- popmon/analysis/functions.py | 7 +- popmon/analysis/hist_numpy.py | 51 ++++++- popmon/analysis/profiling/hist_profiler.py | 4 +- popmon/hist/hist_utils.py | 8 +- popmon/visualization/histogram_section.py | 17 ++- popmon/visualization/section_generator.py | 7 +- popmon/visualization/utils.py | 4 +- requirements.txt | 2 +- setup.py | 2 +- tests/popmon/analysis/test_functions.py | 144 ++------------------ 11 files changed, 86 insertions(+), 164 deletions(-) diff --git a/popmon/analysis/comparison/hist_comparer.py b/popmon/analysis/comparison/hist_comparer.py index 8734c726..63f22dd7 100644 --- a/popmon/analysis/comparison/hist_comparer.py +++ b/popmon/analysis/comparison/hist_comparer.py @@ -34,7 +34,6 @@ ) from ...analysis.hist_numpy import ( check_similar_hists, - get_consistent_numpy_1dhists, get_consistent_numpy_entries, get_consistent_numpy_ndgrids, ) @@ -95,9 +94,8 @@ def hist_compare(row, hist_name1="", hist_name2="", max_res_bound=7.0): # compare if hist1.n_dim == 1: if is_numeric(hist1): - numpy_1dhists = get_consistent_numpy_1dhists([hist1, hist2]) - entries_list = [nphist[0] for nphist in numpy_1dhists] # KS-test only properly defined for (ordered) 1D interval variables + entries_list = get_consistent_numpy_entries([hist1, hist2]) ks_testscore = ks_test(*entries_list) x["ks"] = ks_testscore ks_pvalue = ks_prob(ks_testscore) diff --git a/popmon/analysis/functions.py b/popmon/analysis/functions.py index d7db7be2..6ee92ba7 100644 --- a/popmon/analysis/functions.py +++ b/popmon/analysis/functions.py @@ -343,12 +343,7 @@ def hist_sum(x, hist_name=""): if not similar: return pd.Series(o) - # MB FIX: h_sum not initialized correctly in a sum by histogrammar for sparselybin (origin); below it is. 
- # h_sum = np.sum([hist for hist in hist_list]) - - h_sum = hist_list[0].zero() - for hist in hist_list: - h_sum += hist + h_sum = np.sum(hist_list) o[hist_name] = h_sum return pd.Series(o) diff --git a/popmon/analysis/hist_numpy.py b/popmon/analysis/hist_numpy.py index 1f36f0a9..31967330 100644 --- a/popmon/analysis/hist_numpy.py +++ b/popmon/analysis/hist_numpy.py @@ -24,7 +24,8 @@ import numpy as np from histogrammar.util import get_hist_props -from ..hist.hist_utils import is_numeric +from ..hist.hist_utils import get_bin_centers, is_numeric +from ..stats.numpy import quantile used_hist_types = (histogrammar.Bin, histogrammar.SparselyBin, histogrammar.Categorize) @@ -209,12 +210,19 @@ def get_consistent_numpy_2dgrids(hist_list=[], get_bin_labels=False): return get_consistent_numpy_ndgrids(hist_list, get_bin_labels, dim=2) -def get_consistent_numpy_1dhists(hist_list, get_bin_labels=False): - """Get list of consistent numpy hists for list of sparse input histograms +def get_consistent_numpy_1dhists(hist_list, get_bin_labels=False, crop_range=False): + """Get list of consistent numpy hists for list of sparse (or bin) input histograms + + Works for sparse and bin histograms. + Note: for sparse histograms, all potential bins between low and high are picked up (also unfilled). Note: a numpy histogram is a union of lists of bin_edges and number of entries + This gives the full range of bin_centers, including zeros, which is not robust against (extreme) outliers. + Ideally, use this for plotting of multiple histograms only. :param list hist_list: list of input histogram objects + :param bool get_bin_labels: return bin labels as well, default is false. + :param bool crop_range: return a trimmed version of the histogram, between 5-95% quantiles. :return: list of consistent 1d numpy hists for list of sparse input histograms """ # --- basic checks @@ -229,8 +237,36 @@ def get_consistent_numpy_1dhists(hist_list, get_bin_labels=False): high = max(high_arr) if len(high_arr) > 0 else None # low == None and/or high == None can only happen when all input hists are empty. + if crop_range: + # crop_range option crops a histogram to reasonable range, e.g. for plotting, giving nice plots. + # in particular this protects against outliers that distort the view on the core part of the distribution + # range is quantiles 5-95% + 5% on both sides + q05_arr = [] + q95_arr = [] + for hist in hist_list: + bin_centers, values = get_bin_centers(hist) + bin_entries = np.array([v.entries for v in values]) + qs = quantile(bin_centers, [0.05, 0.95], bin_entries) + q05_arr.append(qs[0]) + q95_arr.append(qs[1]) + q05 = min(q05_arr) if len(q05_arr) > 0 else np.nan + q95 = max(q95_arr) if len(q95_arr) > 0 else np.nan + delta = q95 - q05 + var_min = q05 - (0.06 / 0.9) * delta + var_max = q95 + (0.06 / 0.9) * delta + if 0.0 < var_min < 0.2 * delta: + var_min = 0.0 + elif -0.2 * delta < var_max < 0.0: + var_max = 0.0 + if not np.isnan(var_min) and low is not None and var_min > low: + low = var_min + if not np.isnan(var_max) and high is not None and var_max < high: + high = var_max + # if one of the input histograms is sparse and empty, copy the bin-edges and bin-centers # from a filled histogram, and use empty bin-entries array + # MB 20220601: note this gives the full range of bin_centers, which is not robust against (extreme) outliers + # get_consistent_numpy_entries() ignores all empty bins. 
bin_edges = [0.0, 1.0] bin_centers = [0.5] null_entries = [0.0] @@ -239,7 +275,7 @@ def get_consistent_numpy_1dhists(hist_list, get_bin_labels=False): if hist.low is not None and hist.high is not None: bin_edges = hist.bin_edges(low, high) bin_centers = hist.bin_centers(low, high) - null_entries = [0] * len(bin_centers) + null_entries = np.zeros(len(bin_centers)) break nphist_list = [] @@ -260,6 +296,10 @@ def get_consistent_numpy_1dhists(hist_list, get_bin_labels=False): def get_consistent_numpy_entries(hist_list, get_bin_labels=False): """Get list of consistent numpy bin_entries for list of 1d input histograms + Works for categorize, sparse and bin histograms. + Note: for sparse histograms, *only* the filled bins are picked up. + (this is not the case when calling get_consistent_numpy_1dhists(), which takes all bins b/n low and high.) + :param list hist_list: list of input histogrammar histograms :return: list of consistent 1d numpy arrays with bin_entries for list of input histograms """ @@ -281,7 +321,7 @@ def get_consistent_numpy_entries(hist_list, get_bin_labels=False): # union of all labels encountered labels = set() for hist in hist_list: - bin_labels = hist.bin_centers() if all_num else hist.keySet + bin_labels = get_bin_centers(hist)[0] labels = labels.union(bin_labels) labels = sorted(labels) @@ -294,7 +334,6 @@ def get_consistent_numpy_entries(hist_list, get_bin_labels=False): props = get_hist_props(hist_list[0]) if props["is_bool"]: cat_labels = [lab == "True" for lab in cat_labels] - kwargs = {"labels": cat_labels} entries_list = [hist.bin_entries(**kwargs) for hist in hist_list] diff --git a/popmon/analysis/profiling/hist_profiler.py b/popmon/analysis/profiling/hist_profiler.py index e76fb195..4d5cfb03 100644 --- a/popmon/analysis/profiling/hist_profiler.py +++ b/popmon/analysis/profiling/hist_profiler.py @@ -207,8 +207,8 @@ def _profile_1d_histogram(self, name, hist): is_num = is_numeric(hist) is_ts = is_timestamp(hist) or name in self.var_timestamp - bin_labels = np.array(get_bin_centers(hist)[0]) - bin_counts = np.array([v.entries for v in get_bin_centers(hist)[1]]) + bin_labels, values = get_bin_centers(hist) + bin_counts = np.array([v.entries for v in values]) if len(bin_counts) == 0: self.logger.warning(f'Histogram "{name}" is empty; skipping.') diff --git a/popmon/hist/hist_utils.py b/popmon/hist/hist_utils.py index 4cd9c463..6a6141b6 100644 --- a/popmon/hist/hist_utils.py +++ b/popmon/hist/hist_utils.py @@ -220,13 +220,15 @@ def is_numeric(hist): def sparse_bin_centers_x(hist): """Get x-axis bin centers of sparse histogram""" - keys = sorted(hist.bins.keys()) + # note: want sorted keys for plotting + keys = np.array(sorted(hist.bins.keys())) if hist.minBin is None or hist.maxBin is None: # number of bins is set to 1. 
centers = np.array([hist.origin + 0.5 * hist.binWidth]) else: - centers = np.array([hist.origin + (i + 0.5) * hist.binWidth for i in keys]) - + # default for filled histogram + centers = hist.origin + (keys + 0.5) * hist.binWidth + # note: so values is also sorted values = [hist.bins[key] for key in keys] return centers, values diff --git a/popmon/visualization/histogram_section.py b/popmon/visualization/histogram_section.py index aedcb385..bfa41871 100644 --- a/popmon/visualization/histogram_section.py +++ b/popmon/visualization/histogram_section.py @@ -167,13 +167,14 @@ def transform(self, data_obj: dict, sections: Optional[list] = None): return sections -def _plot_histograms(feature, date, hc_list, hist_names, top_n): +def _plot_histograms(feature, date, hc_list, hist_names, top_n, max_nbins=1000): """Split off plot histogram generation to allow for parallel processing :param str feature: feature :param str date: date of time slot :param list hc_list: histogram list :param list hist_names: names of histograms to show as labels + :param int max_nbins: maximum number of histogram bins allowed for plot (default 1000) :return: dict with plotted histogram """ # basic checks @@ -187,18 +188,22 @@ def _plot_histograms(feature, date, hc_list, hist_names, top_n): hist_names = [hn for i, hn in enumerate(hist_names) if i not in none_hists] # more basic checks if len(hc_list) == 0: - return date, "" + return {"name": date, "description": get_stat_description(date), "plot": ""} assert_similar_hists(hc_list) # make plot. note: slow! if hc_list[0].n_dim == 1: + if all(h.size == 0 for h in hc_list): + # triviality checks, skip all histograms empty + return {"name": date, "description": get_stat_description(date), "plot": ""} + props = get_hist_props(hc_list[0]) is_num = props["is_num"] is_ts = props["is_ts"] y_label = "Bin count" if len(hc_list) == 1 else "Bin probability" if is_num: - numpy_1dhists = get_consistent_numpy_1dhists(hc_list) + numpy_1dhists = get_consistent_numpy_1dhists(hc_list, crop_range=True) entries_list = [nphist[0] for nphist in numpy_1dhists] bins = numpy_1dhists[0][1] # bins = bin-edges else: @@ -207,9 +212,9 @@ def _plot_histograms(feature, date, hc_list, hist_names, top_n): hc_list, get_bin_labels=True ) # bins = bin-labels - if len(bins) == 0: - # skip empty histograms - return date, "" + # skip histograms with too many bins to plot (default more than 1000) + if len(bins) > max_nbins: + return {"name": date, "description": get_stat_description(date), "plot": ""} # normalize histograms for plotting (comparison!) in case there is more than one. if len(hc_list) >= 2: diff --git a/popmon/visualization/section_generator.py b/popmon/visualization/section_generator.py index 8dd24487..f06fbc10 100644 --- a/popmon/visualization/section_generator.py +++ b/popmon/visualization/section_generator.py @@ -132,7 +132,7 @@ def transform( inplace=True, errors="ignore", ) - dates = [short_date(str(date)) for date in df.index.tolist()] + dates = np.array([short_date(str(date)) for date in df.index.tolist()]) metrics = filter_metrics( df.columns, self.ignore_stat_endswith, self.show_stats @@ -143,7 +143,7 @@ def transform( feature, metric, dates, - df[metric], + df[metric].values, static_bounds, fdbounds, self.prefix, @@ -201,13 +201,14 @@ def _plot_metric( ) # choose dynamic bounds if present bounds = dbounds if len(dbounds) > 0 else sbounds + # prune dates and values dates = _prune(dates, last_n, skip_first_n, skip_last_n) values = _prune(values, last_n, skip_first_n, skip_last_n) # make plot. 
note: slow! plot = plot_bars_b64( - data=np.array(values), + data=values, labels=dates, ylim=True, bounds=bounds, diff --git a/popmon/visualization/utils.py b/popmon/visualization/utils.py index d4414367..ceb6d6e1 100644 --- a/popmon/visualization/utils.py +++ b/popmon/visualization/utils.py @@ -78,7 +78,7 @@ def plot_bars_b64(data, labels=None, bounds=None, ylim=False, skip_empty=True): """ # basic checks first n = data.size # number of bins - if labels and len(labels) != n: + if labels is not None and len(labels) != n: raise ValueError("shape mismatch: x-axis labels do not match the data shape") # skip plot generation for empty datasets @@ -97,7 +97,7 @@ def plot_bars_b64(data, labels=None, bounds=None, ylim=False, skip_empty=True): width = (index[1] - index[0]) * 0.9 if n >= 2 else 1.0 ax.bar(index, data, width=width, align="center") - if labels: + if labels is not None: ax.set_xticks(index) ax.set_xticklabels(labels, fontdict={"rotation": "vertical"}) granularity = math.ceil(len(labels) / 50) diff --git a/requirements.txt b/requirements.txt index 4514b353..0da1e9a9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ numpy>=1.18.0 pandas>=0.25.1 scipy>=1.5.2 -histogrammar>=1.0.27 +histogrammar>=1.0.28 phik jinja2 tqdm diff --git a/setup.py b/setup.py index e27a90ec..4478f116 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,6 @@ from setuptools import find_packages, setup -__version__ = "0.6.1" +__version__ = "0.6.2" with open("requirements.txt") as f: REQUIREMENTS = f.read().splitlines() diff --git a/tests/popmon/analysis/test_functions.py b/tests/popmon/analysis/test_functions.py index 7640228a..53e5950f 100644 --- a/tests/popmon/analysis/test_functions.py +++ b/tests/popmon/analysis/test_functions.py @@ -425,67 +425,6 @@ def test_roll_norm_hist_mean_cov(): [ 0.8, 0.1, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, 0.1, ] ) @@ -545,66 +484,9 @@ def test_expand_norm_hist_mean_cov(): 0.56666667, 0.03333333, 0.03333333, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, 0.06666667, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, 0.06666667, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, 0.06666667, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, 0.03333333, 0.06666667, 0.06666667, @@ -665,15 +547,15 @@ def test_chi_squared1(): assert f in datastore["output_hist"] df = datastore["output_hist"]["A_score"] - np.testing.assert_almost_equal(df["chi2"][6], 4.25) + np.testing.assert_almost_equal(df["chi2"].values[6], 4.25) df = datastore["output_hist"]["A_score:num_employees"] - np.testing.assert_almost_equal(df["chi2"][-2], 2.1333333333333315) + np.testing.assert_almost_equal(df["chi2"].values[-2], 2.1333333333333315) df = datastore["output_hist"]["bankrupt"] - np.testing.assert_almost_equal(df["chi2"][6], 0.40000000000000024) + np.testing.assert_almost_equal(df["chi2"].values[6], 0.40000000000000024) df = datastore["output_hist"]["country"] - np.testing.assert_almost_equal(df["chi2"][5], 
0.8999999999999994) + np.testing.assert_almost_equal(df["chi2"].values[5], 0.8999999999999994) df = datastore["output_hist"]["num_employees"] - np.testing.assert_almost_equal(df["chi2"][5], 0.849999999999999) + np.testing.assert_almost_equal(df["chi2"].values[5], 0.849999999999999) @pytest.mark.filterwarnings("ignore:invalid value encountered in true_divide") @@ -726,15 +608,15 @@ def test_chi_squared2(): assert f in datastore["output_hist"] df = datastore["output_hist"]["A_score"] - np.testing.assert_almost_equal(df["chi2"][-1], 9.891821919006366) + np.testing.assert_almost_equal(df["chi2"].values[-1], 9.891821919006366) df = datastore["output_hist"]["A_score:num_employees"] - np.testing.assert_almost_equal(df["chi2"][-2], 3.217532467532462) + np.testing.assert_almost_equal(df["chi2"].values[-2], 3.217532467532462) df = datastore["output_hist"]["bankrupt"] - np.testing.assert_almost_equal(df["chi2"][-1], 0.23767605633802757) + np.testing.assert_almost_equal(df["chi2"].values[-1], 0.23767605633802757) df = datastore["output_hist"]["country"] - np.testing.assert_almost_equal(df["chi2"][-1], 1.3717532467532458) + np.testing.assert_almost_equal(df["chi2"].values[-1], 1.3717532467532458) df = datastore["output_hist"]["num_employees"] - np.testing.assert_almost_equal(df["chi2"][-1], 1.1858766233766194) + np.testing.assert_almost_equal(df["chi2"].values[-1], 1.1858766233766194) def test_chi_ReferenceNormHistComparer(): @@ -775,7 +657,7 @@ def test_chi_ReferenceNormHistComparer(): assert f in datastore["comparisons"] df = datastore["comparisons"]["A_score"] - np.testing.assert_almost_equal(df["chi2"][0], 2.2884111855886022) + np.testing.assert_almost_equal(df["chi2"].values[0], 2.2884111855886022) def test_chi_ExpandingNormHistComparer(): @@ -812,7 +694,7 @@ def test_chi_ExpandingNormHistComparer(): assert f in datastore["comparisons"] df = datastore["comparisons"]["A_score"] - np.testing.assert_almost_equal(df["chi2"][-1], 9.891821919006366) + np.testing.assert_almost_equal(df["chi2"].values[-1], 9.891821919006366) def test_chi_RollingNormHistComparer(): @@ -851,4 +733,4 @@ def test_chi_RollingNormHistComparer(): assert f in datastore["comparisons"] df = datastore["comparisons"]["A_score"] - np.testing.assert_almost_equal(df["chi2"][-1], 37.61910112359518) + np.testing.assert_almost_equal(df["chi2"].values[-1], 37.61910112359518)
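
Usage sketch (illustrative only, not part of the applied diff): the snippet below shows the behaviour this patch introduces, i.e. summing a list of histograms with np.sum and trimming the plotting range of sparse histograms with the new crop_range option of get_consistent_numpy_1dhists. It assumes histogrammar>=1.0.28 as pinned in requirements.txt above; the SparselyBin construction (binWidth, quantity), the fill calls and the toy data are assumptions made for illustration and are not taken from this patch.

    import numpy as np
    import histogrammar as hg  # assumed >= 1.0.28, as required by this patch

    from popmon.analysis.hist_numpy import get_consistent_numpy_1dhists

    np.random.seed(0)

    # two sparse 1d histograms; the second one receives a single extreme outlier
    h1 = hg.SparselyBin(binWidth=1.0, quantity=lambda x: x)
    h2 = hg.SparselyBin(binWidth=1.0, quantity=lambda x: x)
    for x in np.random.normal(loc=10.0, scale=3.0, size=100):
        h1.fill(float(x))
    for x in np.random.normal(loc=12.0, scale=3.0, size=100):
        h2.fill(float(x))
    h2.fill(1.0e6)  # extreme outlier, ~1% of the entries of h2

    # histogrammar primitives support '+', so a list of them can be reduced with np.sum
    h_sum = np.sum([h1, h2])
    print(h_sum.entries)  # 201.0

    # default: the consistent bin edges span the full range, all the way up to the outlier
    full = get_consistent_numpy_1dhists([h1, h2])
    # crop_range=True: the range is trimmed to roughly the 5-95% weighted-quantile window
    cropped = get_consistent_numpy_1dhists([h1, h2], crop_range=True)
    print(len(full[0][1]), len(cropped[0][1]))  # ~1e6 bin edges vs. far fewer

np.sum reduces the histogram list with the containers' own addition; the histogrammar>=1.0.28 pin above is what makes this correct for SparselyBin, so the earlier workaround of starting from an explicit zero() accumulator and adding histograms in a for loop is no longer needed. The cropped variant is what the histogram section of the report now uses for plotting, so a single extreme outlier no longer blows up the number of plotted bins.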