Skip to content

Commit

Permalink
fix: protection against outliers in sparse histograms (#215)
Browse files Browse the repository at this point in the history
- Performance speedup in histogram summation
  o Replace for loop with np.sum
- Protection against outliers in sparse histograms
  o Protect against extreme outliers in sparse histograms by avoiding the use
    of their dense representation where possible.
- Working analysis function unit tests
  o update of test_report_generator
  o section_generator now passes np arrays to plot_bars_b64, no longer pd.Series.
- Added crop_range option to get_consistent_numpy_1dhists function
  o return a cropped range version of the histogram, between 5-95% quantiles.
  o Plot cropped histograms in histogram section of report.
- Bump up versions of histogrammar
  • Loading branch information
mbaak authored Jun 5, 2022
1 parent 9757ea7 commit dc37a00
Show file tree
Hide file tree
Showing 11 changed files with 86 additions and 164 deletions.
4 changes: 1 addition & 3 deletions popmon/analysis/comparison/hist_comparer.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,6 @@
)
from ...analysis.hist_numpy import (
check_similar_hists,
get_consistent_numpy_1dhists,
get_consistent_numpy_entries,
get_consistent_numpy_ndgrids,
)
Expand Down Expand Up @@ -95,9 +94,8 @@ def hist_compare(row, hist_name1="", hist_name2="", max_res_bound=7.0):
# compare
if hist1.n_dim == 1:
if is_numeric(hist1):
numpy_1dhists = get_consistent_numpy_1dhists([hist1, hist2])
entries_list = [nphist[0] for nphist in numpy_1dhists]
# KS-test only properly defined for (ordered) 1D interval variables
entries_list = get_consistent_numpy_entries([hist1, hist2])
ks_testscore = ks_test(*entries_list)
x["ks"] = ks_testscore
ks_pvalue = ks_prob(ks_testscore)
Expand Down
7 changes: 1 addition & 6 deletions popmon/analysis/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -343,12 +343,7 @@ def hist_sum(x, hist_name=""):
if not similar:
return pd.Series(o)

# MB FIX: h_sum not initialized correctly in a sum by histogrammar for sparselybin (origin); below it is.
# h_sum = np.sum([hist for hist in hist_list])

h_sum = hist_list[0].zero()
for hist in hist_list:
h_sum += hist
h_sum = np.sum(hist_list)
o[hist_name] = h_sum
return pd.Series(o)

Expand Down
51 changes: 45 additions & 6 deletions popmon/analysis/hist_numpy.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@
import numpy as np
from histogrammar.util import get_hist_props

from ..hist.hist_utils import is_numeric
from ..hist.hist_utils import get_bin_centers, is_numeric
from ..stats.numpy import quantile

used_hist_types = (histogrammar.Bin, histogrammar.SparselyBin, histogrammar.Categorize)

Expand Down Expand Up @@ -209,12 +210,19 @@ def get_consistent_numpy_2dgrids(hist_list=[], get_bin_labels=False):
return get_consistent_numpy_ndgrids(hist_list, get_bin_labels, dim=2)


def get_consistent_numpy_1dhists(hist_list, get_bin_labels=False):
"""Get list of consistent numpy hists for list of sparse input histograms
def get_consistent_numpy_1dhists(hist_list, get_bin_labels=False, crop_range=False):
"""Get list of consistent numpy hists for list of sparse (or bin) input histograms
Works for sparse and bin histograms.
Note: for sparse histograms, all potential bins between low and high are picked up (also unfilled).
Note: a numpy histogram is a union of lists of bin_edges and number of entries
This gives the full range of bin_centers, including zeros, which is not robust against (extreme) outliers.
Ideally, use this for plotting of multiple histograms only.
:param list hist_list: list of input histogram objects
:param bool get_bin_labels: return bin labels as well, default is false.
:param bool crop_range: return a trimmed version of the histogram, between 5-95% quantiles.
:return: list of consistent 1d numpy hists for list of sparse input histograms
"""
# --- basic checks
Expand All @@ -229,8 +237,36 @@ def get_consistent_numpy_1dhists(hist_list, get_bin_labels=False):
high = max(high_arr) if len(high_arr) > 0 else None
# low == None and/or high == None can only happen when all input hists are empty.

if crop_range:
# crop_range option crops a histogram to reasonable range, e.g. for plotting, giving nice plots.
# in particular this protects against outliers that distort the view on the core part of the distribution
# range is quantiles 5-95%, extended on both sides by a margin of (0.06 / 0.9) * (q95 - q05)
q05_arr = []
q95_arr = []
for hist in hist_list:
bin_centers, values = get_bin_centers(hist)
bin_entries = np.array([v.entries for v in values])
qs = quantile(bin_centers, [0.05, 0.95], bin_entries)
q05_arr.append(qs[0])
q95_arr.append(qs[1])
q05 = min(q05_arr) if len(q05_arr) > 0 else np.nan
q95 = max(q95_arr) if len(q95_arr) > 0 else np.nan
delta = q95 - q05
var_min = q05 - (0.06 / 0.9) * delta
var_max = q95 + (0.06 / 0.9) * delta
if 0.0 < var_min < 0.2 * delta:
var_min = 0.0
elif -0.2 * delta < var_max < 0.0:
var_max = 0.0
if not np.isnan(var_min) and low is not None and var_min > low:
low = var_min
if not np.isnan(var_max) and high is not None and var_max < high:
high = var_max

# if one of the input histograms is sparse and empty, copy the bin-edges and bin-centers
# from a filled histogram, and use empty bin-entries array
# MB 20220601: note this gives the full range of bin_centers, which is not robust against (extreme) outliers
# get_consistent_numpy_entries() ignores all empty bins.
bin_edges = [0.0, 1.0]
bin_centers = [0.5]
null_entries = [0.0]
Expand All @@ -239,7 +275,7 @@ def get_consistent_numpy_1dhists(hist_list, get_bin_labels=False):
if hist.low is not None and hist.high is not None:
bin_edges = hist.bin_edges(low, high)
bin_centers = hist.bin_centers(low, high)
null_entries = [0] * len(bin_centers)
null_entries = np.zeros(len(bin_centers))
break

nphist_list = []
Expand All @@ -260,6 +296,10 @@ def get_consistent_numpy_1dhists(hist_list, get_bin_labels=False):
def get_consistent_numpy_entries(hist_list, get_bin_labels=False):
"""Get list of consistent numpy bin_entries for list of 1d input histograms
Works for categorize, sparse and bin histograms.
Note: for sparse histograms, *only* the filled bins are picked up.
(this is not the case when calling get_consistent_numpy_1dhists(), which takes all bins between low and high.)
:param list hist_list: list of input histogrammar histograms
:return: list of consistent 1d numpy arrays with bin_entries for list of input histograms
"""
Expand All @@ -281,7 +321,7 @@ def get_consistent_numpy_entries(hist_list, get_bin_labels=False):
# union of all labels encountered
labels = set()
for hist in hist_list:
bin_labels = hist.bin_centers() if all_num else hist.keySet
bin_labels = get_bin_centers(hist)[0]
labels = labels.union(bin_labels)
labels = sorted(labels)

Expand All @@ -294,7 +334,6 @@ def get_consistent_numpy_entries(hist_list, get_bin_labels=False):
props = get_hist_props(hist_list[0])
if props["is_bool"]:
cat_labels = [lab == "True" for lab in cat_labels]

kwargs = {"labels": cat_labels}

entries_list = [hist.bin_entries(**kwargs) for hist in hist_list]
Expand Down
4 changes: 2 additions & 2 deletions popmon/analysis/profiling/hist_profiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,8 +207,8 @@ def _profile_1d_histogram(self, name, hist):
is_num = is_numeric(hist)
is_ts = is_timestamp(hist) or name in self.var_timestamp

bin_labels = np.array(get_bin_centers(hist)[0])
bin_counts = np.array([v.entries for v in get_bin_centers(hist)[1]])
bin_labels, values = get_bin_centers(hist)
bin_counts = np.array([v.entries for v in values])

if len(bin_counts) == 0:
self.logger.warning(f'Histogram "{name}" is empty; skipping.')
Expand Down
8 changes: 5 additions & 3 deletions popmon/hist/hist_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,13 +220,15 @@ def is_numeric(hist):

def sparse_bin_centers_x(hist):
"""Get x-axis bin centers of sparse histogram"""
keys = sorted(hist.bins.keys())
# note: want sorted keys for plotting
keys = np.array(sorted(hist.bins.keys()))
if hist.minBin is None or hist.maxBin is None:
# number of bins is set to 1.
centers = np.array([hist.origin + 0.5 * hist.binWidth])
else:
centers = np.array([hist.origin + (i + 0.5) * hist.binWidth for i in keys])

# default for filled histogram
centers = hist.origin + (keys + 0.5) * hist.binWidth
# note: so values is also sorted
values = [hist.bins[key] for key in keys]
return centers, values

Expand Down
17 changes: 11 additions & 6 deletions popmon/visualization/histogram_section.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,13 +167,14 @@ def transform(self, data_obj: dict, sections: Optional[list] = None):
return sections


def _plot_histograms(feature, date, hc_list, hist_names, top_n):
def _plot_histograms(feature, date, hc_list, hist_names, top_n, max_nbins=1000):
"""Split off plot histogram generation to allow for parallel processing
:param str feature: feature
:param str date: date of time slot
:param list hc_list: histogram list
:param list hist_names: names of histograms to show as labels
:param int max_nbins: maximum number of histogram bins allowed for plot (default 1000)
:return: dict with plotted histogram
"""
# basic checks
Expand All @@ -187,18 +188,22 @@ def _plot_histograms(feature, date, hc_list, hist_names, top_n):
hist_names = [hn for i, hn in enumerate(hist_names) if i not in none_hists]
# more basic checks
if len(hc_list) == 0:
return date, ""
return {"name": date, "description": get_stat_description(date), "plot": ""}
assert_similar_hists(hc_list)

# make plot. note: slow!
if hc_list[0].n_dim == 1:
if all(h.size == 0 for h in hc_list):
# trivial case: skip plotting when all histograms are empty
return {"name": date, "description": get_stat_description(date), "plot": ""}

props = get_hist_props(hc_list[0])
is_num = props["is_num"]
is_ts = props["is_ts"]
y_label = "Bin count" if len(hc_list) == 1 else "Bin probability"

if is_num:
numpy_1dhists = get_consistent_numpy_1dhists(hc_list)
numpy_1dhists = get_consistent_numpy_1dhists(hc_list, crop_range=True)
entries_list = [nphist[0] for nphist in numpy_1dhists]
bins = numpy_1dhists[0][1] # bins = bin-edges
else:
Expand All @@ -207,9 +212,9 @@ def _plot_histograms(feature, date, hc_list, hist_names, top_n):
hc_list, get_bin_labels=True
) # bins = bin-labels

if len(bins) == 0:
# skip empty histograms
return date, ""
# skip histograms with too many bins to plot (default more than 1000)
if len(bins) > max_nbins:
return {"name": date, "description": get_stat_description(date), "plot": ""}

# normalize histograms for plotting (comparison!) in case there is more than one.
if len(hc_list) >= 2:
Expand Down
7 changes: 4 additions & 3 deletions popmon/visualization/section_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ def transform(
inplace=True,
errors="ignore",
)
dates = [short_date(str(date)) for date in df.index.tolist()]
dates = np.array([short_date(str(date)) for date in df.index.tolist()])

metrics = filter_metrics(
df.columns, self.ignore_stat_endswith, self.show_stats
Expand All @@ -143,7 +143,7 @@ def transform(
feature,
metric,
dates,
df[metric],
df[metric].values,
static_bounds,
fdbounds,
self.prefix,
Expand Down Expand Up @@ -201,13 +201,14 @@ def _plot_metric(
)
# choose dynamic bounds if present
bounds = dbounds if len(dbounds) > 0 else sbounds

# prune dates and values
dates = _prune(dates, last_n, skip_first_n, skip_last_n)
values = _prune(values, last_n, skip_first_n, skip_last_n)

# make plot. note: slow!
plot = plot_bars_b64(
data=np.array(values),
data=values,
labels=dates,
ylim=True,
bounds=bounds,
Expand Down
4 changes: 2 additions & 2 deletions popmon/visualization/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ def plot_bars_b64(data, labels=None, bounds=None, ylim=False, skip_empty=True):
"""
# basic checks first
n = data.size # number of bins
if labels and len(labels) != n:
if labels is not None and len(labels) != n:
raise ValueError("shape mismatch: x-axis labels do not match the data shape")

# skip plot generation for empty datasets
Expand All @@ -97,7 +97,7 @@ def plot_bars_b64(data, labels=None, bounds=None, ylim=False, skip_empty=True):
width = (index[1] - index[0]) * 0.9 if n >= 2 else 1.0
ax.bar(index, data, width=width, align="center")

if labels:
if labels is not None:
ax.set_xticks(index)
ax.set_xticklabels(labels, fontdict={"rotation": "vertical"})
granularity = math.ceil(len(labels) / 50)
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
numpy>=1.18.0
pandas>=0.25.1
scipy>=1.5.2
histogrammar>=1.0.27
histogrammar>=1.0.28
phik
jinja2
tqdm
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from setuptools import find_packages, setup

__version__ = "0.6.1"
__version__ = "0.6.2"

with open("requirements.txt") as f:
REQUIREMENTS = f.read().splitlines()
Expand Down
Loading

0 comments on commit dc37a00

Please sign in to comment.