From 8e5b39b218b342bb17114caa234d8d5160c37746 Mon Sep 17 00:00:00 2001 From: Simon Brugman Date: Thu, 1 Sep 2022 14:09:02 +0200 Subject: [PATCH] feat(report): histogram inspector See #230 Co-authored-by: pradyot-09 --- popmon/config.py | 9 +- .../notebooks/popmon_tutorial_reports.ipynb | 2 +- popmon/visualization/histogram_section.py | 76 +++-- popmon/visualization/utils.py | 291 ++++++++++++++---- 4 files changed, 291 insertions(+), 87 deletions(-) diff --git a/popmon/config.py b/popmon/config.py index 7375e89d..048def3a 100644 --- a/popmon/config.py +++ b/popmon/config.py @@ -81,9 +81,12 @@ class HistogramSectionModel(SectionModel): name: str = "Histograms" """Name of the histograms section in the report""" - description: str = "Histograms of the last few time slots (default: 2)." + description: str = "This section contains visualisations of individual histograms and heatmaps of them over time." """Description of the histograms section in the report""" + inspector_histogram_choices: int = 2 + """The number of histograms that can be compared at once (e.g. the number of dropdowns)""" + hist_names: List[ Literal["heatmap", "heatmap_column_normalized", "heatmap_row_normalized"] ] = [ @@ -113,8 +116,8 @@ class HistogramSectionModel(SectionModel): } """Descriptions of the heatmaps in the report""" - plot_hist_n: int = 2 - """plot histograms for last 'n' periods. default is 2 (optional)""" + plot_hist_n: int = 0 + """plot histograms for last 'n' periods. default is 0 to show all (optional)""" top_n: int = 20 """plot heatmap for top 'n' categories. default is 20 (optional)""" diff --git a/popmon/notebooks/popmon_tutorial_reports.ipynb b/popmon/notebooks/popmon_tutorial_reports.ipynb index 6ec5dfc3..201d7773 100644 --- a/popmon/notebooks/popmon_tutorial_reports.ipynb +++ b/popmon/notebooks/popmon_tutorial_reports.ipynb @@ -355,7 +355,7 @@ "metadata": {}, "outputs": [], "source": [ - "show_image(report.datastore[\"report_sections\"][4][\"features\"][0][\"plots\"][\"prev1\"][0])" + "show_image(report.datastore[\"report_sections\"][4][\"features\"][0][\"plots\"][\"ref\"][0])" ] }, { diff --git a/popmon/visualization/histogram_section.py b/popmon/visualization/histogram_section.py index 64aae482..0dcab736 100644 --- a/popmon/visualization/histogram_section.py +++ b/popmon/visualization/histogram_section.py @@ -33,7 +33,11 @@ from ..base import Module from ..config import HistogramSectionModel from ..utils import parallel, short_date -from ..visualization.utils import plot_heatmap, plot_histogram_overlay +from ..visualization.utils import ( + histogram_basic_checks, + plot_heatmap, + plot_histogram_overlay, +) class HistogramSection(Module): @@ -80,6 +84,7 @@ def __init__( self.hist_names_formatted = settings.hist_names_formatted self.plot_hist_n = settings.plot_hist_n self.top_n = settings.top_n + self.n_choices = settings.inspector_histogram_choices self.cmap = settings.cmap def get_description(self): @@ -92,12 +97,17 @@ def transform(self, data_obj: dict, sections: Optional[list] = None): features = self.get_features(list(data_obj.keys())) features_w_metrics = [] + # Treat these as static + is_static_reference = self.reference_type in ["self", "external"] + self.logger.info(f'Generating section "{self.section_name}".') for feature in tqdm(features, ncols=100): df = data_obj.get(feature, pd.DataFrame()) last_n = ( - len(df.index) if len(df.index) < self.plot_hist_n else self.plot_hist_n + len(df.index) + if (len(df.index) < self.plot_hist_n or self.plot_hist_n == 0) + else self.plot_hist_n ) hist_names = [hn for hn in self.hist_names if hn in df.columns] if len(hist_names) == 0 and len(self.hist_name_starts_with) > 0: @@ -140,15 +150,43 @@ def transform(self, data_obj: dict, sections: Optional[list] = None): (feature, dates[i], hists[i], hist_names, self.top_n) for i in range(last_n) ] + + # get histograms for each timestamp plots = parallel(_plot_histograms, args) plot_type_layouts = {} # filter out potential empty plots - plots = [e for e in plots if len(e["plot"])] - plots = sorted(plots, key=lambda plot: plot["name"]) - if len(plots) > 0: - plot_type_layouts["histogram"] = plots[0]["layout"] + plots = [e for e in plots if len(e)] + plots = sorted(plots, key=lambda plot: plot["date"]) + + # basic checks for histograms + histogram_basic_checks(plots) + + for plot in plots: + for index in range(len(plot["hists"])): + if plot["hist_names"][index] == "histogram_prev1": + del plot["hist_names"][index] + del plot["hists"][index] + break + + # get histogram plots + histogram = {} + if len(plots) > 1: + histogram = plot_histogram_overlay( + plots, + plots[0]["is_num"], + plots[0]["is_ts"], + is_static_reference, + top=self.top_n, + n_choices=self.n_choices, + ) + + if len(histogram) > 0: + plot_type_layouts["histogram"] = histogram["layout"] + histogram = [histogram] + else: + histogram = [] # filter out potential empty heatmap plots, then prepend them to the sorted histograms hplots = [] @@ -160,7 +198,7 @@ def transform(self, data_obj: dict, sections: Optional[list] = None): if len(hplots) > 0: plot_type_layouts["heatmap"] = hplots[0]["layout"] - plots = hplots + plots + plots = hplots + histogram features_w_metrics.append( { @@ -188,7 +226,7 @@ def _plot_histograms(feature, date, hc_list, hist_names, top_n, max_nbins=1000): :param list hc_list: histogram list :param list hist_names: names of histograms to show as labels :param int max_nbins: maximum number of histogram bins allowed for plot (default 1000) - :return: dict with plotted histogram + :return: dict with histograms for each timestamp """ # basic checks if len(hc_list) != len(hist_names): @@ -206,7 +244,7 @@ def _plot_histograms(feature, date, hc_list, hist_names, top_n, max_nbins=1000): # make plot. note: slow! if hc_list[0].n_dim == 1: - if all(h.size == 0 for h in hc_list): + if all(h.entries == 0 for h in hc_list): # triviality checks, skip all histograms empty return {"name": date, "description": "", "plot": ""} @@ -245,20 +283,20 @@ def _plot_histograms(feature, date, hc_list, hist_names, top_n, max_nbins=1000): entries_list = np.reshape(entries_list.ravel(), (-1, len(bins))) hists = [(el, bins) for el in entries_list] - plot = plot_histogram_overlay( - hists, feature, hist_names, y_label, is_num, is_ts - ) + elif hc_list[0].n_dim == 2: - plot = {} + return {} else: - plot = {} + return {} return { - "name": date, - "type": "histogram", - "description": "", - "plot": plot.get("data", ""), - "layout": plot.get("layout", ""), + "date": date, + "hists": hists, + "feature": feature, + "hist_names": hist_names, + "y_label": y_label, + "is_num": is_num, + "is_ts": is_ts, } diff --git a/popmon/visualization/utils.py b/popmon/visualization/utils.py index ab22e26f..28d1732b 100644 --- a/popmon/visualization/utils.py +++ b/popmon/visualization/utils.py @@ -21,6 +21,7 @@ import json import logging import math +import warnings from collections import defaultdict from typing import Dict, List @@ -231,16 +232,61 @@ def plot_traffic_lights_alerts_aggregate( ) +# basic checks for histograms +def histogram_basic_checks(plots={}): + if len(plots) == 0: + return + + for plot in plots: + if len(plot["hist_names"]) == 0: + plot["hist_names"] = [f"hist{i}" for i in range(len(plot["hists"]))] + if plot["hist_names"]: + if len(plot["hists"]) != len(plot["hist_names"]): + raise ValueError("length of hist and hist_names are different") + + for i, hist in enumerate(plot["hists"]): + try: + hist_values, hist_bins = hist + except BaseException as e: + raise ValueError( + "Cannot extract binning and values from input histogram" + ) from e + + assert hist_values is not None and len( + hist_values + ), "Histogram bin values have not been set." + assert hist_bins is not None and len( + hist_bins + ), "Histogram binning has not been set." + + if plot["is_ts"]: + plot["is_num"] = True + + if plot["is_num"]: + bin_edges = hist_bins + bin_values = hist_values + assert ( + len(bin_edges) == len(bin_values) + 1 + ), "bin edges (+ upper edge) and bin values have inconsistent lengths: {:d} vs {:d}. {}".format( + len(bin_edges), len(bin_values), plot["feature"] + ) + else: + labels = hist_bins + values = hist_values + assert len(labels) == len( + values + ), f'labels and values have different array lengths: {len(labels):d} vs {len(values):d}. {plot["feature"]}' + + def plot_histogram_overlay( - hists, - x_label, - hist_names=[], - y_label=None, + plots=[], is_num=True, is_ts=False, + is_static_reference=True, top=20, + n_choices=2, ): - """Create and plot (overlapping) histogram(s) of column values. + """Create and plot (overlapping/grouped) histogram(s) of column values. Copyright Eskapade: Kindly taken from Eskapade package and then modified. Reference link: @@ -248,92 +294,132 @@ def plot_histogram_overlay( License: https://github.com/KaveIO/Eskapade-Core/blob/master/LICENSE Modifications copyright ING WBAA. - :param list hists: list of input numpy histogram = values, bin_edges - :param str x_label: Label for histogram x-axis - :param list hist_names: list of histogram names. default is []. - :param str y_label: Label for histogram y-axis. default is None. - :param bool is_num: True if observable to plot is numeric. default is True. - :param bool is_ts: True if observable to plot is a timestamp. default is False. + :param list plots: list of dicts containing histograms for all timestamps + :param bool is_num: True if observable to plot is numeric. default is True. + :param bool is_ts: True if observable to plot is a timestamp. default is False. + :param bool is_static_reference: True if the reference is static. default is True :param int top: only print the top 20 characters of x-labels and y-labels. default is 20. + :param int n_choices: number of plots to compare at once :return: JSON encoded plot image :rtype: str """ - # basic checks - if len(hist_names) == 0: - hist_names = [f"hist{i}" for i in range(len(hists))] - if hist_names: - if len(hists) != len(hist_names): - raise ValueError("length of hist and hist_names are different") fig = go.Figure() - alpha = 1.0 / len(hists) - for i, hist in enumerate(hists): - try: - hist_values, hist_bins = hist - except BaseException as e: - raise ValueError( - "Cannot extract binning and values from input histogram" - ) from e - - assert hist_values is not None and len( - hist_values - ), "Histogram bin values have not been set." - assert hist_bins is not None and len( - hist_bins - ), "Histogram binning has not been set." - - # basic attribute check: time stamps treated as numeric. - if is_ts: - is_num = True - - # plot numeric and time stamps - if is_num: - bin_edges = hist_bins - bin_values = hist_values - assert ( - len(bin_edges) == len(bin_values) + 1 - ), "bin edges (+ upper edge) and bin values have inconsistent lengths: {:d} vs {:d}. {}".format( - len(bin_edges), len(bin_values), x_label - ) + alpha = 0.4 + + # check number of plots + if len(plots) < 2: + warnings.warn("insufficient plots for histogram inspection") + return + + base_plot = plots[0] + + # basic attribute check: time stamps treated as numeric. + if is_ts: + is_num = True - # plot histogram + # plot numeric and time stamps + if is_num: + + # plot histogram + for index in range(n_choices): + bin_edges = plots[index]["hists"][0][1] + bin_values = plots[index]["hists"][0][0] fig.add_trace( go.Bar( x=bin_edges[1:], y=bin_values, + opacity=alpha, showlegend=True, + name=plots[index]["date"], + meta=index, + ) + ) + + # plot reference + for index in range(1 if is_static_reference else n_choices): + bin_edges = ( + plots[index]["hists"][0][1] + if len(plots[index]["hists"]) < 2 + else plots[index]["hists"][1][1] + ) + bin_values = ( + [0 for x in range(len(plots[index]["hists"][0][0]))] + if len(plots[index]["hists"]) < 2 + else plots[index]["hists"][1][0] + ) + fig.add_trace( + go.Bar( + x=bin_edges[1:], + y=bin_values, opacity=alpha, - name=hist_names[i], + showlegend=True, + name="no_ref" + if len(plots[index]["hists"]) < 2 + else "Reference" + if is_static_reference + else (plots[index]["date"] + "-") + + plots[index]["hist_names"][1].split("_")[-1], + meta=index + 2, ) ) - # set x-axis properties - xlim = [min(bin_edges), max(bin_edges)] - fig.update_xaxes(range=xlim) + # set x-axis properties + xlim = [min(bin_edges), max(bin_edges)] + fig.update_xaxes(range=xlim) - # plot categories - else: - labels = hist_bins - values = hist_values - assert len(labels) == len( - values - ), f"labels and values have different array lengths: {len(labels):d} vs {len(values):d}. {x_label}" + # plot categories + else: - # plot histogram + # plot histogram for first 'n_choices' timestamps + for index in range(n_choices): + labels = plots[index]["hists"][0][1] + values = plots[index]["hists"][0][0] fig.add_trace( go.Bar( x=[xtick(lab, top) for lab in labels], y=values, + opacity=alpha, showlegend=True, + name=plots[index]["date"], + meta=index, + ) + ) + + # plot reference for first 1 or 'n_choices' timestamps + for index in range(1 if is_static_reference else n_choices): + labels = ( + plots[index]["hists"][0][1] + if len(plots[index]["hists"]) < 2 + else plots[index]["hists"][1][1] + ) + values = ( + [0 for _ in range(len(plots[index]["hists"][0][0]))] + if len(plots[index]["hists"]) < 2 + else plots[index]["hists"][1][0] + ) + fig.add_trace( + go.Bar( + x=[xtick(lab, top) for lab in labels], + y=values, opacity=alpha, - name=hist_names[i], - hovertemplate="%{y:.4f}", + showlegend=True, + name="no_ref" + if len(plots[index]["hists"]) < 2 + else "Reference" + if is_static_reference + else plots[index]["date"] + + " " + + plots[index]["hist_names"][1].split("_")[-1], + meta=index + n_choices, ) ) # set common histogram layout properties - y_label = str(y_label) if y_label is not None else "Bin count" + y_label = ( + str(base_plot["y_label"]) if base_plot["y_label"] is not None else "Bin count" + ) fig.update_yaxes( title=y_label, minor_ticks="outside", @@ -342,7 +428,7 @@ def plot_histogram_overlay( mirror=True, ) fig.update_xaxes( - title=x_label, + title=base_plot["feature"], minor_ticks="outside", showline=True, linecolor="black", @@ -362,8 +448,85 @@ def plot_histogram_overlay( margin={"l": 40, "r": 10}, ) + # dropdown menu + fig.update_layout( + updatemenus=[ + *[ + { + "buttons": [ + { + "label": f'{plot["date"]}', + "method": "restyle", + "args": [ + { + "y": [ + plot["hists"][0][0], + [0 for _ in range(len(plot["hists"][0][0]))] + if len(plot["hists"]) < 2 + else plot["hists"][1][0], + ], + "name": [ + plot["date"], + "no_ref" + if len(plot["hist_names"]) < 2 + else "Reference" + if is_static_reference + else plots[index]["date"] + + " " + + plot["hist_names"][1].split("_")[-1], + ], + }, + [b, b + 2], + ], + } + for plot in plots + ], + "active": b, + "pad": {"r": 10, "t": 10}, + "borderwidth": 0, + "bgcolor": "#d3d3d3", + "showactive": True, + "x": b / 5, + "y": 1.45, + "xanchor": "left", + "yanchor": "top", + } + for b in range(n_choices) + ], + { + "buttons": [ + { + "label": mode, + "method": "relayout", + "args": [ + { + "barmode": mode, + } + ], + } + for mode in ["overlay", "group"] + ], + "pad": {"r": 10, "t": 10}, + "borderwidth": 0, + "bgcolor": "#d3d3d3", + "showactive": True, + "x": 1, + "y": 1.45, + "xanchor": "right", + "yanchor": "top", + }, + ] + ) + plot = json.loads(fig.to_json()) - return plot + return { + "name": "Histogram Inspector ", + "type": "histogram", + "description": "", + "plot": plot.get("data", ""), + "layout": plot.get("layout", ""), + "full_width": True, + } def plot_heatmap(