From da731c47d3cef4b5874fca9b9d450f00d086a03b Mon Sep 17 00:00:00 2001
From: Simon Brugman <sfbbrugman@gmail.com>
Date: Thu, 1 Sep 2022 14:09:02 +0200
Subject: [PATCH] feat(report): histogram inspector

See #230
---
 popmon/config.py                              |   9 +-
 .../notebooks/popmon_tutorial_reports.ipynb   |   2 +-
 popmon/visualization/histogram_section.py     |  76 +++--
 popmon/visualization/utils.py                 | 291 ++++++++++++++----
 4 files changed, 291 insertions(+), 87 deletions(-)

diff --git a/popmon/config.py b/popmon/config.py
index 7375e89d..048def3a 100644
--- a/popmon/config.py
+++ b/popmon/config.py
@@ -81,9 +81,12 @@ class HistogramSectionModel(SectionModel):
     name: str = "Histograms"
     """Name of the histograms section in the report"""
 
-    description: str = "Histograms of the last few time slots (default: 2)."
+    description: str = "This section contains visualisations of individual histograms and heatmaps of them over time."
     """Description of the histograms section in the report"""
 
+    inspector_histogram_choices: int = 2
+    """The number of histograms that can be compared at once (i.e. the number of dropdowns)"""
+
     hist_names: List[
         Literal["heatmap", "heatmap_column_normalized", "heatmap_row_normalized"]
     ] = [
@@ -113,8 +116,8 @@ class HistogramSectionModel(SectionModel):
     }
     """Descriptions of the heatmaps in the report"""
 
-    plot_hist_n: int = 2
-    """plot histograms for last 'n' periods. default is 2 (optional)"""
+    plot_hist_n: int = 0
+    """plot histograms for last 'n' periods. default is 0 to show all (optional)"""
 
     top_n: int = 20
     """plot heatmap for top 'n' categories. default is 20 (optional)"""
diff --git a/popmon/notebooks/popmon_tutorial_reports.ipynb b/popmon/notebooks/popmon_tutorial_reports.ipynb
index 6ec5dfc3..201d7773 100644
--- a/popmon/notebooks/popmon_tutorial_reports.ipynb
+++ b/popmon/notebooks/popmon_tutorial_reports.ipynb
@@ -355,7 +355,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "show_image(report.datastore[\"report_sections\"][4][\"features\"][0][\"plots\"][\"prev1\"][0])"
+    "show_image(report.datastore[\"report_sections\"][4][\"features\"][0][\"plots\"][\"ref\"][0])"
    ]
   },
   {
diff --git a/popmon/visualization/histogram_section.py b/popmon/visualization/histogram_section.py
index 64aae482..0dcab736 100644
--- a/popmon/visualization/histogram_section.py
+++ b/popmon/visualization/histogram_section.py
@@ -33,7 +33,11 @@
 from ..base import Module
 from ..config import HistogramSectionModel
 from ..utils import parallel, short_date
-from ..visualization.utils import plot_heatmap, plot_histogram_overlay
+from ..visualization.utils import (
+    histogram_basic_checks,
+    plot_heatmap,
+    plot_histogram_overlay,
+)
 
 
 class HistogramSection(Module):
@@ -80,6 +84,7 @@ def __init__(
         self.hist_names_formatted = settings.hist_names_formatted
         self.plot_hist_n = settings.plot_hist_n
         self.top_n = settings.top_n
+        self.n_choices = settings.inspector_histogram_choices
         self.cmap = settings.cmap
 
     def get_description(self):
@@ -92,12 +97,17 @@ def transform(self, data_obj: dict, sections: Optional[list] = None):
         features = self.get_features(list(data_obj.keys()))
         features_w_metrics = []
 
+        # Treat these as static
+        is_static_reference = self.reference_type in ["self", "external"]
+
         self.logger.info(f'Generating section "{self.section_name}".')
 
         for feature in tqdm(features, ncols=100):
             df = data_obj.get(feature, pd.DataFrame())
             last_n = (
-                len(df.index) if len(df.index) < self.plot_hist_n else self.plot_hist_n
+                len(df.index)
+                if (len(df.index) < self.plot_hist_n or self.plot_hist_n == 0)
+                else self.plot_hist_n
             )
             hist_names = [hn for hn in self.hist_names if hn in df.columns]
             if len(hist_names) == 0 and len(self.hist_name_starts_with) > 0:
@@ -140,15 +150,43 @@ def transform(self, data_obj: dict, sections: Optional[list] = None):
                 (feature, dates[i], hists[i], hist_names, self.top_n)
                 for i in range(last_n)
             ]
+
+            # get histograms for each timestamp
             plots = parallel(_plot_histograms, args)
 
             plot_type_layouts = {}
 
             # filter out potential empty plots
-            plots = [e for e in plots if len(e["plot"])]
-            plots = sorted(plots, key=lambda plot: plot["name"])
-            if len(plots) > 0:
-                plot_type_layouts["histogram"] = plots[0]["layout"]
+            plots = [e for e in plots if len(e)]
+            plots = sorted(plots, key=lambda plot: plot["date"])
+
+            # basic checks for histograms
+            histogram_basic_checks(plots)
+
+            for plot in plots:
+                for index in range(len(plot["hists"])):
+                    if plot["hist_names"][index] == "histogram_prev1":
+                        del plot["hist_names"][index]
+                        del plot["hists"][index]
+                        break
+
+            # get histogram plots
+            histogram = {}
+            if len(plots) > 1:
+                histogram = plot_histogram_overlay(
+                    plots,
+                    plots[0]["is_num"],
+                    plots[0]["is_ts"],
+                    is_static_reference,
+                    top=self.top_n,
+                    n_choices=self.n_choices,
+                )
+
+            if len(histogram) > 0:
+                plot_type_layouts["histogram"] = histogram["layout"]
+                histogram = [histogram]
+            else:
+                histogram = []
 
             # filter out potential empty heatmap plots, then prepend them to the sorted histograms
             hplots = []
@@ -160,7 +198,7 @@ def transform(self, data_obj: dict, sections: Optional[list] = None):
             if len(hplots) > 0:
                 plot_type_layouts["heatmap"] = hplots[0]["layout"]
 
-            plots = hplots + plots
+            plots = hplots + histogram
 
             features_w_metrics.append(
                 {
@@ -188,7 +226,7 @@ def _plot_histograms(feature, date, hc_list, hist_names, top_n, max_nbins=1000):
     :param list hc_list: histogram list
     :param list hist_names: names of histograms to show as labels
     :param int max_nbins: maximum number of histogram bins allowed for plot (default 1000)
-    :return: dict with plotted histogram
+    :return: dict with the histograms and plotting metadata for the given timestamp
     """
     # basic checks
     if len(hc_list) != len(hist_names):
@@ -206,7 +244,7 @@ def _plot_histograms(feature, date, hc_list, hist_names, top_n, max_nbins=1000):
 
     # make plot. note: slow!
     if hc_list[0].n_dim == 1:
-        if all(h.size == 0 for h in hc_list):
+        if all(h.entries == 0 for h in hc_list):
             # triviality checks, skip all histograms empty
             return {"name": date, "description": "", "plot": ""}
 
@@ -245,20 +283,20 @@ def _plot_histograms(feature, date, hc_list, hist_names, top_n, max_nbins=1000):
                 entries_list = np.reshape(entries_list.ravel(), (-1, len(bins)))
 
         hists = [(el, bins) for el in entries_list]
-        plot = plot_histogram_overlay(
-            hists, feature, hist_names, y_label, is_num, is_ts
-        )
+
     elif hc_list[0].n_dim == 2:
-        plot = {}
+        return {}
     else:
-        plot = {}
+        return {}
 
     return {
-        "name": date,
-        "type": "histogram",
-        "description": "",
-        "plot": plot.get("data", ""),
-        "layout": plot.get("layout", ""),
+        "date": date,
+        "hists": hists,
+        "feature": feature,
+        "hist_names": hist_names,
+        "y_label": y_label,
+        "is_num": is_num,
+        "is_ts": is_ts,
     }
 
 
diff --git a/popmon/visualization/utils.py b/popmon/visualization/utils.py
index ab22e26f..28d1732b 100644
--- a/popmon/visualization/utils.py
+++ b/popmon/visualization/utils.py
@@ -21,6 +21,7 @@
 import json
 import logging
 import math
+import warnings
 from collections import defaultdict
 from typing import Dict, List
 
@@ -231,16 +232,61 @@ def plot_traffic_lights_alerts_aggregate(
     )
 
 
+# basic checks for histograms; fills in missing hist_names and updates the plot dicts in place
+def histogram_basic_checks(plots={}):
+    if len(plots) == 0:
+        return
+
+    for plot in plots:
+        if len(plot["hist_names"]) == 0:
+            plot["hist_names"] = [f"hist{i}" for i in range(len(plot["hists"]))]
+        if plot["hist_names"]:
+            if len(plot["hists"]) != len(plot["hist_names"]):
+                raise ValueError("length of hist and hist_names are different")
+
+        for i, hist in enumerate(plot["hists"]):
+            try:
+                hist_values, hist_bins = hist
+            except BaseException as e:
+                raise ValueError(
+                    "Cannot extract binning and values from input histogram"
+                ) from e
+
+            assert hist_values is not None and len(
+                hist_values
+            ), "Histogram bin values have not been set."
+            assert hist_bins is not None and len(
+                hist_bins
+            ), "Histogram binning has not been set."
+
+            if plot["is_ts"]:
+                plot["is_num"] = True
+
+            if plot["is_num"]:
+                bin_edges = hist_bins
+                bin_values = hist_values
+                assert (
+                    len(bin_edges) == len(bin_values) + 1
+                ), "bin edges (+ upper edge) and bin values have inconsistent lengths: {:d} vs {:d}. {}".format(
+                    len(bin_edges), len(bin_values), plot["feature"]
+                )
+            else:
+                labels = hist_bins
+                values = hist_values
+                assert len(labels) == len(
+                    values
+                ), f'labels and values have different array lengths: {len(labels):d} vs {len(values):d}. {plot["feature"]}'
+
+
 def plot_histogram_overlay(
-    hists,
-    x_label,
-    hist_names=[],
-    y_label=None,
+    plots=[],
     is_num=True,
     is_ts=False,
+    is_static_reference=True,
     top=20,
+    n_choices=2,
 ):
-    """Create and plot (overlapping) histogram(s) of column values.
+    """Create and plot (overlapping/grouped) histogram(s) of column values.
 
     Copyright Eskapade:
     Kindly taken from Eskapade package and then modified. Reference link:
@@ -248,92 +294,132 @@ def plot_histogram_overlay(
     License: https://github.com/KaveIO/Eskapade-Core/blob/master/LICENSE
     Modifications copyright ING WBAA.
 
-    :param list hists: list of input numpy histogram = values, bin_edges
-    :param str x_label: Label for histogram x-axis
-    :param list hist_names: list of histogram names. default is [].
-    :param str y_label: Label for histogram y-axis. default is None.
-    :param bool is_num: True if observable to plot is numeric. default is True.
-    :param bool is_ts: True if observable to plot is a timestamp. default is False.
+    :param list plots: list of dicts containing histograms for all timestamps
+    :param bool is_num: True if observable to plot is numeric. default is True.
+    :param bool is_ts: True if observable to plot is a timestamp. default is False.
+    :param bool is_static_reference: True if the reference is static. default is True
     :param int top: only print the top 20 characters of x-labels and y-labels. default is 20.
+    :param int n_choices: number of plots to compare at once. default is 2.
     :return: JSON encoded plot image
     :rtype: str
     """
-    # basic checks
-    if len(hist_names) == 0:
-        hist_names = [f"hist{i}" for i in range(len(hists))]
-    if hist_names:
-        if len(hists) != len(hist_names):
-            raise ValueError("length of hist and hist_names are different")
 
     fig = go.Figure()
 
-    alpha = 1.0 / len(hists)
-    for i, hist in enumerate(hists):
-        try:
-            hist_values, hist_bins = hist
-        except BaseException as e:
-            raise ValueError(
-                "Cannot extract binning and values from input histogram"
-            ) from e
-
-        assert hist_values is not None and len(
-            hist_values
-        ), "Histogram bin values have not been set."
-        assert hist_bins is not None and len(
-            hist_bins
-        ), "Histogram binning has not been set."
-
-        # basic attribute check: time stamps treated as numeric.
-        if is_ts:
-            is_num = True
-
-        # plot numeric and time stamps
-        if is_num:
-            bin_edges = hist_bins
-            bin_values = hist_values
-            assert (
-                len(bin_edges) == len(bin_values) + 1
-            ), "bin edges (+ upper edge) and bin values have inconsistent lengths: {:d} vs {:d}. {}".format(
-                len(bin_edges), len(bin_values), x_label
-            )
+    alpha = 0.4
+
+    # check number of plots
+    if len(plots) < 2:
+        warnings.warn("insufficient plots for histogram inspection")
+        return
+
+    base_plot = plots[0]
+
+    # basic attribute check: time stamps treated as numeric.
+    if is_ts:
+        is_num = True
 
-            # plot histogram
+    # plot numeric and time stamps
+    if is_num:
+
+        # plot histogram
+        for index in range(n_choices):
+            bin_edges = plots[index]["hists"][0][1]
+            bin_values = plots[index]["hists"][0][0]
             fig.add_trace(
                 go.Bar(
                     x=bin_edges[1:],
                     y=bin_values,
+                    opacity=alpha,
                     showlegend=True,
+                    name=plots[index]["date"],
+                    meta=index,
+                )
+            )
+
+        # plot reference
+        for index in range(1 if is_static_reference else n_choices):
+            bin_edges = (
+                plots[index]["hists"][0][1]
+                if len(plots[index]["hists"]) < 2
+                else plots[index]["hists"][1][1]
+            )
+            bin_values = (
+                [0 for x in range(len(plots[index]["hists"][0][0]))]
+                if len(plots[index]["hists"]) < 2
+                else plots[index]["hists"][1][0]
+            )
+            fig.add_trace(
+                go.Bar(
+                    x=bin_edges[1:],
+                    y=bin_values,
                     opacity=alpha,
-                    name=hist_names[i],
+                    showlegend=True,
+                    name="no_ref"
+                    if len(plots[index]["hists"]) < 2
+                    else "Reference"
+                    if is_static_reference
+                    else (plots[index]["date"] + "-")
+                    + plots[index]["hist_names"][1].split("_")[-1],
+                    meta=index + 2,
                 )
             )
 
-            # set x-axis properties
-            xlim = [min(bin_edges), max(bin_edges)]
-            fig.update_xaxes(range=xlim)
+        # set x-axis properties
+        xlim = [min(bin_edges), max(bin_edges)]
+        fig.update_xaxes(range=xlim)
 
-        # plot categories
-        else:
-            labels = hist_bins
-            values = hist_values
-            assert len(labels) == len(
-                values
-            ), f"labels and values have different array lengths: {len(labels):d} vs {len(values):d}. {x_label}"
+    # plot categories
+    else:
 
-            # plot histogram
+        # plot histogram for first 'n_choices' timestamps
+        for index in range(n_choices):
+            labels = plots[index]["hists"][0][1]
+            values = plots[index]["hists"][0][0]
             fig.add_trace(
                 go.Bar(
                     x=[xtick(lab, top) for lab in labels],
                     y=values,
+                    opacity=alpha,
                     showlegend=True,
+                    name=plots[index]["date"],
+                    meta=index,
+                )
+            )
+
+        # plot reference for first 1 or 'n_choices' timestamps
+        for index in range(1 if is_static_reference else n_choices):
+            labels = (
+                plots[index]["hists"][0][1]
+                if len(plots[index]["hists"]) < 2
+                else plots[index]["hists"][1][1]
+            )
+            values = (
+                [0 for _ in range(len(plots[index]["hists"][0][0]))]
+                if len(plots[index]["hists"]) < 2
+                else plots[index]["hists"][1][0]
+            )
+            fig.add_trace(
+                go.Bar(
+                    x=[xtick(lab, top) for lab in labels],
+                    y=values,
                     opacity=alpha,
-                    name=hist_names[i],
-                    hovertemplate="%{y:.4f}",
+                    showlegend=True,
+                    name="no_ref"
+                    if len(plots[index]["hists"]) < 2
+                    else "Reference"
+                    if is_static_reference
+                    else plots[index]["date"]
+                    + " "
+                    + plots[index]["hist_names"][1].split("_")[-1],
+                    meta=index + n_choices,
                 )
             )
 
     # set common histogram layout properties
-    y_label = str(y_label) if y_label is not None else "Bin count"
+    y_label = (
+        str(base_plot["y_label"]) if base_plot["y_label"] is not None else "Bin count"
+    )
     fig.update_yaxes(
         title=y_label,
         minor_ticks="outside",
@@ -342,7 +428,7 @@ def plot_histogram_overlay(
         mirror=True,
     )
     fig.update_xaxes(
-        title=x_label,
+        title=base_plot["feature"],
         minor_ticks="outside",
         showline=True,
         linecolor="black",
@@ -362,8 +448,85 @@ def plot_histogram_overlay(
         margin={"l": 40, "r": 10},
     )
 
+    # dropdown menu
+    fig.update_layout(
+        updatemenus=[
+            *[
+                {
+                    "buttons": [
+                        {
+                            "label": f'{plot["date"]}',
+                            "method": "restyle",
+                            "args": [
+                                {
+                                    "y": [
+                                        plot["hists"][0][0],
+                                        [0 for _ in range(len(plot["hists"][0][0]))]
+                                        if len(plot["hists"]) < 2
+                                        else plot["hists"][1][0],
+                                    ],
+                                    "name": [
+                                        plot["date"],
+                                        "no_ref"
+                                        if len(plot["hist_names"]) < 2
+                                        else "Reference"
+                                        if is_static_reference
+                                        else plots[index]["date"]
+                                        + " "
+                                        + plot["hist_names"][1].split("_")[-1],
+                                    ],
+                                },
+                                [b, b + 2],
+                            ],
+                        }
+                        for plot in plots
+                    ],
+                    "active": b,
+                    "pad": {"r": 10, "t": 10},
+                    "borderwidth": 0,
+                    "bgcolor": "#d3d3d3",
+                    "showactive": True,
+                    "x": b / 5,
+                    "y": 1.45,
+                    "xanchor": "left",
+                    "yanchor": "top",
+                }
+                for b in range(n_choices)
+            ],
+            {
+                "buttons": [
+                    {
+                        "label": mode,
+                        "method": "relayout",
+                        "args": [
+                            {
+                                "barmode": mode,
+                            }
+                        ],
+                    }
+                    for mode in ["overlay", "group"]
+                ],
+                "pad": {"r": 10, "t": 10},
+                "borderwidth": 0,
+                "bgcolor": "#d3d3d3",
+                "showactive": True,
+                "x": 1,
+                "y": 1.45,
+                "xanchor": "right",
+                "yanchor": "top",
+            },
+        ]
+    )
+
     plot = json.loads(fig.to_json())
-    return plot
+    return {
+        "name": "Histogram Inspector ",
+        "type": "histogram",
+        "description": "",
+        "plot": plot.get("data", ""),
+        "layout": plot.get("layout", ""),
+        "full_width": True,
+    }
 
 
 def plot_heatmap(