diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 16d363d0..df93445e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -26,3 +26,17 @@ repos: rev: v1.12.1 hooks: - id: blacken-docs +- repo: local + hooks: + - id: docautogenerate + name: docsautogenerate + entry: bash -c 'cd ./docs/ && bash autogenerate.sh' + language: system +- repo: https://github.com/nbQA-dev/nbQA + rev: 1.3.1 + hooks: + - id: nbqa-black + - id: nbqa-pyupgrade + args: ['--py36-plus'] + - id: nbqa-isort + args: ['--profile=black'] \ No newline at end of file diff --git a/docs/source/comparisons.rst b/docs/source/comparisons.rst index c810ec08..e3cc6c07 100644 --- a/docs/source/comparisons.rst +++ b/docs/source/comparisons.rst @@ -35,3 +35,19 @@ The code below demonstrates how this could be achieved: return np.sum(np.abs(p - q)) If you developed a custom comparison that could be generically used, then please considering contributing it to the package. + +Comparison settings +------------------- + +Whenever a comparison has parameters, it is possible to alter them globally: + +.. code-block:: python + + from functools import partial + + from popmon.analysis.comparison.comparison_registry import Comparisons + + # Set the max_res_bound to 5 (default 7) for the chi2 comparison function + Comparisons.update_func( + "chi2", partial(Comparisons.get_func("chi2"), max_res_bound=5.0) + ) diff --git a/docs/source/configuration.rst b/docs/source/configuration.rst index cd2aedb0..78c0bfbb 100644 --- a/docs/source/configuration.rst +++ b/docs/source/configuration.rst @@ -30,15 +30,22 @@ in four different ways: .. code-block:: python + settings = Settings() + settings.comparison.window = 10 + settings.comparison.shift = 1 + # generate stability report with specific monitoring rules - report = df.pm_stability_report(reference_type="rolling", window=10, shift=1) + report = df.pm_stability_report(reference_type="rolling", settings=settings) 4. 
Using an expanding window on all preceding time slots within the same DataFrame. This is also a dynamic method, with variable window size. All the available previous time slots are used. For example, if we have 2 time slots available and shift=1, window size will be 1 (so the previous slot is the reference), while if we have 10 time slots and shift=1, window size will be 9 (and all previous time slots are reference). .. code-block:: python + settings = Settings() + settings.comparison.shift = 1 + # generate stability report with specific monitoring rules - report = df.pm_stability_report(reference_type="expanding", shift=1) + report = df.pm_stability_report(reference_type="expanding", settings=settings) Note that, by default, popmon also performs a rolling comparison of the histograms in each time period with those in the previous time period. The results of these comparisons contain the term "prev1", and are found in the comparisons section @@ -111,8 +118,11 @@ When generating a report, they can be provided as a dictionary: .. code-block:: python + settings = Settings() + settings.monitoring.monitoring_rules = your_monitoring_rules + # generate stability report with specific monitoring rules - report = df.pm_stability_report(monitoring_rules=your_monitoring_rules) + report = df.pm_stability_report(settings=settings) When not provided, the default setting is: @@ -168,8 +178,11 @@ When generating a report, the ``pull_rules`` can be provided as a dictionary: .. 
code-block:: python + settings = Settings() + settings.monitoring.pull_rules = your_pull_rules + # generate stability report with specific monitoring rules - report = df.pm_stability_report(pull_rules=your_pull_rules) + report = df.pm_stability_report(settings=settings) The default for `pull_rules` is: diff --git a/docs/source/index.rst b/docs/source/index.rst index 3a1b7cf6..e05d870e 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -18,7 +18,6 @@ Contents comparisons tutorials configuration - tree developing changelog diff --git a/docs/source/popmon.alerting.rst b/docs/source/popmon.alerting.rst index 6b782f1e..4f328813 100644 --- a/docs/source/popmon.alerting.rst +++ b/docs/source/popmon.alerting.rst @@ -20,7 +20,6 @@ popmon.alerting.compute\_tl\_bounds module :undoc-members: :show-inheritance: - Module contents --------------- diff --git a/docs/source/popmon.analysis.comparison.rst b/docs/source/popmon.analysis.comparison.rst index 214562e7..079d52fb 100644 --- a/docs/source/popmon.analysis.comparison.rst +++ b/docs/source/popmon.analysis.comparison.rst @@ -4,6 +4,14 @@ popmon.analysis.comparison package Submodules ---------- +popmon.analysis.comparison.comparisons module +--------------------------------------------- + +.. 
automodule:: popmon.analysis.comparison.comparisons + :members: + :undoc-members: + :show-inheritance: + popmon.analysis.comparison.hist\_comparer module ------------------------------------------------ @@ -12,7 +20,6 @@ popmon.analysis.comparison.hist\_comparer module :undoc-members: :show-inheritance: - Module contents --------------- diff --git a/docs/source/popmon.analysis.profiling.rst b/docs/source/popmon.analysis.profiling.rst index dd4e08c8..e5dac456 100644 --- a/docs/source/popmon.analysis.profiling.rst +++ b/docs/source/popmon.analysis.profiling.rst @@ -12,6 +12,14 @@ popmon.analysis.profiling.hist\_profiler module :undoc-members: :show-inheritance: +popmon.analysis.profiling.profiles module +----------------------------------------- + +.. automodule:: popmon.analysis.profiling.profiles + :members: + :undoc-members: + :show-inheritance: + popmon.analysis.profiling.pull\_calculator module ------------------------------------------------- @@ -20,7 +28,6 @@ popmon.analysis.profiling.pull\_calculator module :undoc-members: :show-inheritance: - Module contents --------------- diff --git a/docs/source/popmon.analysis.rst b/docs/source/popmon.analysis.rst index 1d2d3f34..e65a4751 100644 --- a/docs/source/popmon.analysis.rst +++ b/docs/source/popmon.analysis.rst @@ -5,6 +5,7 @@ Subpackages ----------- .. toctree:: + :maxdepth: 4 popmon.analysis.comparison popmon.analysis.profiling @@ -44,7 +45,6 @@ popmon.analysis.merge\_statistics module :undoc-members: :show-inheritance: - Module contents --------------- diff --git a/docs/source/popmon.base.rst b/docs/source/popmon.base.rst index dbec0961..7255015e 100644 --- a/docs/source/popmon.base.rst +++ b/docs/source/popmon.base.rst @@ -20,6 +20,13 @@ popmon.base.pipeline module :undoc-members: :show-inheritance: +popmon.base.registry module +--------------------------- + +.. 
automodule:: popmon.base.registry + :members: + :undoc-members: + :show-inheritance: Module contents --------------- diff --git a/docs/source/popmon.decorators.rst b/docs/source/popmon.decorators.rst new file mode 100644 index 00000000..adfdb5cf --- /dev/null +++ b/docs/source/popmon.decorators.rst @@ -0,0 +1,29 @@ +popmon.decorators package +========================= + +Submodules +---------- + +popmon.decorators.pandas module +------------------------------- + +.. automodule:: popmon.decorators.pandas + :members: + :undoc-members: + :show-inheritance: + +popmon.decorators.spark module +------------------------------ + +.. automodule:: popmon.decorators.spark + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. automodule:: popmon.decorators + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/popmon.hist.filling.rst b/docs/source/popmon.hist.filling.rst index f9ee6f28..84f9a030 100644 --- a/docs/source/popmon.hist.filling.rst +++ b/docs/source/popmon.hist.filling.rst @@ -1,59 +1,6 @@ popmon.hist.filling package =========================== -Submodules ----------- - -popmon.hist.filling.histogram\_filler\_base module --------------------------------------------------- - -.. automodule:: popmon.hist.filling.histogram_filler_base - :members: - :undoc-members: - :show-inheritance: - -popmon.hist.filling.make\_histograms module -------------------------------------------- - -.. automodule:: popmon.hist.filling.make_histograms - :members: - :undoc-members: - :show-inheritance: - :noindex: - -popmon.hist.filling.numpy\_histogrammar module ----------------------------------------------- - -.. automodule:: popmon.hist.filling.numpy_histogrammar - :members: - :undoc-members: - :show-inheritance: - -popmon.hist.filling.pandas\_histogrammar module ------------------------------------------------ - -.. 
automodule:: popmon.hist.filling.pandas_histogrammar - :members: - :undoc-members: - :show-inheritance: - -popmon.hist.filling.spark\_histogrammar module ----------------------------------------------- - -.. automodule:: popmon.hist.filling.spark_histogrammar - :members: - :undoc-members: - :show-inheritance: - -popmon.hist.filling.utils module --------------------------------- - -.. automodule:: popmon.hist.filling.utils - :members: - :undoc-members: - :show-inheritance: - - Module contents --------------- diff --git a/docs/source/popmon.hist.rst b/docs/source/popmon.hist.rst index b8e1dd6c..c601328c 100644 --- a/docs/source/popmon.hist.rst +++ b/docs/source/popmon.hist.rst @@ -5,6 +5,7 @@ Subpackages ----------- .. toctree:: + :maxdepth: 4 popmon.hist.filling @@ -19,23 +20,14 @@ popmon.hist.hist\_splitter module :undoc-members: :show-inheritance: -popmon.hist.histogram module ----------------------------- +popmon.hist.hist\_utils module +------------------------------ -.. automodule:: popmon.hist.histogram +.. automodule:: popmon.hist.hist_utils :members: :undoc-members: :show-inheritance: -popmon.hist.patched\_histogrammer module ----------------------------------------- - -.. 
automodule:: popmon.hist.patched_histogrammer - :members: - :undoc-members: - :show-inheritance: - - Module contents --------------- diff --git a/docs/source/popmon.io.rst b/docs/source/popmon.io.rst index eadae921..a1a42ab0 100644 --- a/docs/source/popmon.io.rst +++ b/docs/source/popmon.io.rst @@ -28,7 +28,6 @@ popmon.io.json\_reader module :undoc-members: :show-inheritance: - Module contents --------------- diff --git a/docs/source/popmon.pipeline.rst b/docs/source/popmon.pipeline.rst index 6359282e..d294e08c 100644 --- a/docs/source/popmon.pipeline.rst +++ b/docs/source/popmon.pipeline.rst @@ -44,7 +44,6 @@ popmon.pipeline.report\_pipelines module :undoc-members: :show-inheritance: - Module contents --------------- diff --git a/docs/source/popmon.rst b/docs/source/popmon.rst index 714ecfff..471aaf3c 100644 --- a/docs/source/popmon.rst +++ b/docs/source/popmon.rst @@ -5,10 +5,12 @@ Subpackages ----------- .. toctree:: + :maxdepth: 4 popmon.alerting popmon.analysis popmon.base + popmon.decorators popmon.hist popmon.io popmon.pipeline @@ -35,6 +37,14 @@ popmon.resources module :undoc-members: :show-inheritance: +popmon.utils module +------------------- + +.. 
automodule:: popmon.utils + :members: + :undoc-members: + :show-inheritance: + popmon.version module --------------------- @@ -43,7 +53,6 @@ popmon.version module :undoc-members: :show-inheritance: - Module contents --------------- diff --git a/docs/source/popmon.stats.rst b/docs/source/popmon.stats.rst index 91fc4371..fd8b5567 100644 --- a/docs/source/popmon.stats.rst +++ b/docs/source/popmon.stats.rst @@ -12,7 +12,6 @@ popmon.stats.numpy module :undoc-members: :show-inheritance: - Module contents --------------- diff --git a/docs/source/popmon.stitching.rst b/docs/source/popmon.stitching.rst index fe815c89..99f2a271 100644 --- a/docs/source/popmon.stitching.rst +++ b/docs/source/popmon.stitching.rst @@ -12,7 +12,6 @@ popmon.stitching.hist\_stitcher module :undoc-members: :show-inheritance: - Module contents --------------- diff --git a/docs/source/popmon.visualization.rst b/docs/source/popmon.visualization.rst index ff4077e5..b006ff3e 100644 --- a/docs/source/popmon.visualization.rst +++ b/docs/source/popmon.visualization.rst @@ -4,6 +4,14 @@ popmon.visualization package Submodules ---------- +popmon.visualization.alert\_section\_generator module +----------------------------------------------------- + +.. automodule:: popmon.visualization.alert_section_generator + :members: + :undoc-members: + :show-inheritance: + popmon.visualization.backend module ----------------------------------- @@ -20,6 +28,14 @@ popmon.visualization.histogram\_section module :undoc-members: :show-inheritance: +popmon.visualization.overview\_section module +--------------------------------------------- + +.. 
automodule:: popmon.visualization.overview_section + :members: + :undoc-members: + :show-inheritance: + popmon.visualization.report\_generator module --------------------------------------------- @@ -36,6 +52,14 @@ popmon.visualization.section\_generator module :undoc-members: :show-inheritance: +popmon.visualization.traffic\_light\_section\_generator module +-------------------------------------------------------------- + +.. automodule:: popmon.visualization.traffic_light_section_generator + :members: + :undoc-members: + :show-inheritance: + popmon.visualization.utils module --------------------------------- @@ -44,7 +68,6 @@ popmon.visualization.utils module :undoc-members: :show-inheritance: - Module contents --------------- diff --git a/docs/source/popmon_index.rst b/docs/source/popmon_index.rst index 8b089dd2..47ebae48 100644 --- a/docs/source/popmon_index.rst +++ b/docs/source/popmon_index.rst @@ -1,4 +1,4 @@ -popmon +POPMON ====== .. toctree:: diff --git a/docs/source/tree.rst b/docs/source/tree.rst deleted file mode 100644 index effe0ca6..00000000 --- a/docs/source/tree.rst +++ /dev/null @@ -1,91 +0,0 @@ -====================== -Package tree structure -====================== - - -.. code-block:: text - - . 
- ├── alerting/ - │ ├── alerts_summary.py - │ └── compute_tl_bounds.py - ├── analysis/ - │ ├── apply_func.py - │ ├── comparison/ - │ │ └── hist_comparer.py - │ ├── functions.py - │ ├── hist_numpy.py - │ ├── merge_statistics.py - │ └── profiling/ - │ ├── hist_profiler.py - │ └── pull_calculator.py - ├── base/ - │ ├── module.py - │ └── pipeline.py - ├── config.py - ├── decorators/ - │ ├── pandas.py - │ └── spark.py - ├── hist/ - │ ├── filling/ - │ │ ├── histogram_filler_base.py - │ │ ├── make_histograms.py - │ │ ├── numpy_histogrammar.py - │ │ ├── pandas_histogrammar.py - │ │ ├── spark_histogrammar.py - │ │ └── utils.py - │ ├── hist_splitter.py - │ ├── histogram.py - │ └── patched_histogrammer.py - ├── io/ - │ ├── file_reader.py - │ ├── file_writer.py - │ └── json_reader.py - ├── notebooks/ - │ ├── flight_delays.csv.gz - │ ├── flight_delays_reference.csv.gz - │ ├── popmon_tutorial_advanced.ipynb - │ ├── popmon_tutorial_basic.ipynb - │ └── popmon_tutorial_incremental_data.ipynb - ├── pipeline/ - │ ├── amazing_pipeline.py - │ ├── metrics.py - │ ├── metrics_pipelines.py - │ ├── report.py - │ └── report_pipelines.py - ├── resources.py - ├── stats/ - │ └── numpy.py - ├── stitching/ - │ └── hist_stitcher.py - ├── test_data/ - │ ├── data_generator_hists.json.gz - │ ├── example.json - │ ├── example_histogram.json - │ ├── synthetic_histograms.json - │ └── test.csv.gz - ├── version.py - └── visualization/ - ├── backend.py - ├── histogram_section.py - ├── report_generator.py - ├── section_generator.py - ├── templates/ - │ ├── assets/ - │ │ ├── css/ - │ │ │ ├── bootstrap.min.css - │ │ │ └── custom-style.css - │ │ └── js/ - │ │ ├── bootstrap.bundle.min.js - │ │ ├── custom-script.js - │ │ ├── jquery.easing.min.js - │ │ ├── jquery.min.js - │ │ └── scrolling-nav.js - │ ├── card.html - │ ├── core.html - │ ├── footer.html - │ ├── header.html - │ ├── modal-popup.html - │ ├── notebook_iframe.html - │ └── section.html - └── utils.py diff --git a/examples/flight_delays.py 
b/examples/flight_delays.py index 103871bb..ca84fa26 100644 --- a/examples/flight_delays.py +++ b/examples/flight_delays.py @@ -1,21 +1,28 @@ import pandas as pd -import popmon # noqa -from popmon import resources +import popmon +from popmon import Settings, resources # open synthetic data df = pd.read_csv( resources.data("flight_delays.csv.gz"), index_col=0, parse_dates=["DATE"] ) + +# Configuration of the monitoring rules and report +settings = Settings() +settings.report.extended_report = False +settings.monitoring.pull_rules = {"*_pull": [10, 7, -7, -10]} + # generate stability report using automatic binning of all encountered features # (importing popmon automatically adds this functionality to a dataframe) -report = df.pm_stability_report( +report = popmon.df_stability_report( + df, + reference_type="self", time_axis="DATE", time_width="1w", time_offset="2015-07-02", - extended_report=False, - pull_rules={"*_pull": [10, 7, -7, -10]}, + settings=settings, ) # or save the report to file diff --git a/examples/synthetic_data.py b/examples/synthetic_data.py index 62fe981a..9e006701 100644 --- a/examples/synthetic_data.py +++ b/examples/synthetic_data.py @@ -8,7 +8,11 @@ # generate stability report using automatic binning of all encountered features # (importing popmon automatically adds this functionality to a dataframe) -report = df.pm_stability_report(time_axis="date", features=["date:age", "date:gender"]) +report = df.pm_stability_report( + time_axis="date", + time_width="2w", + features=["date:age", "date:gender", "date:isActive", "date:eyeColor"], +) # or save the report to file report.to_file("test_data_report.html") diff --git a/popmon/__init__.py b/popmon/__init__.py index bc734eac..ecd13edb 100644 --- a/popmon/__init__.py +++ b/popmon/__init__.py @@ -28,6 +28,7 @@ # pandas/spark dataframe decorators from popmon import decorators +from .config import Settings from .pipeline.metrics import df_stability_metrics, stability_metrics from .pipeline.report import 
df_stability_report, stability_report from .stitching import stitch_histograms @@ -44,4 +45,5 @@ "stability_report", "stitch_histograms", "__version__", + "Settings", ] diff --git a/popmon/alerting/alerts_summary.py b/popmon/alerting/alerts_summary.py index c3b74555..053c0c4e 100644 --- a/popmon/alerting/alerts_summary.py +++ b/popmon/alerting/alerts_summary.py @@ -93,9 +93,6 @@ def transform(self, data: dict) -> Optional[dict]: tlv = pd.concat(df_list, axis=1) dfc = pd.DataFrame(index=tlv.index) - # worst traffic light - cols = fnmatch.filter(tlv.columns, "*_worst") - dfc["worst"] = tlv[cols].values.max(axis=1) if len(cols) else 0 # colors of traffic lights for color in ["green", "yellow", "red"]: cols = fnmatch.filter(tlv.columns, f"*_n_{color}") diff --git a/popmon/alerting/compute_tl_bounds.py b/popmon/alerting/compute_tl_bounds.py index 0077b6f8..961e1601 100644 --- a/popmon/alerting/compute_tl_bounds.py +++ b/popmon/alerting/compute_tl_bounds.py @@ -33,7 +33,7 @@ def traffic_light_summary(row, cols=None, prefix=""): """Make a summary of traffic light alerts present in the dataframe - Count number of green, yellow, red traffic lights and worst value. + Count number of green, yellow and red traffic lights. Evaluate with df.apply(traffic_light_summary, axis=1) @@ -41,9 +41,7 @@ def traffic_light_summary(row, cols=None, prefix=""): :param list cols: list of cols to calculate traffic light summary of (optional) :param str prefix: prefix of traffic light columns, in case cols is empty. default is ``"tl_"`` """ - x = pd.Series( - {"worst": np.nan, "n_red": np.nan, "n_yellow": np.nan, "n_green": np.nan} - ) + x = {"n_red": np.nan, "n_yellow": np.nan, "n_green": np.nan} if cols is None or len(cols) == 0: # if no columns are given, find traffic light columns for which summary is made. 
@@ -53,14 +51,13 @@ def traffic_light_summary(row, cols=None, prefix=""): else row.index.to_list() ) if len(cols) == 0: - return x + return pd.Series(x) traffic_lights = np.array([row[c] for c in cols]) - x["worst"] = np.max(traffic_lights) x["n_red"] = (traffic_lights == 2).sum() x["n_yellow"] = (traffic_lights == 1).sum() x["n_green"] = (traffic_lights == 0).sum() - return x + return pd.Series(x) def traffic_light(value, red_high, yellow_high, yellow_low=0, red_low=0): @@ -174,13 +171,14 @@ def __init__( def get_description(self): return self.traffic_light_func.__name__ - def _set_traffic_lights(self, feature, cols, pattern, rule_name): + def _set_traffic_lights(self, feature, cols, pattern, rule): process_cols = fnmatch.filter(cols, pattern) for pcol in process_cols: name = feature + ":" + pcol if name not in self.traffic_lights: - bounds = self.monitoring_rules[eval(rule_name)] + key = rule(name, feature, pattern) + bounds = self.monitoring_rules[key] self.traffic_lights[name] = bounds metrics = ( [pcol] @@ -220,7 +218,10 @@ def transform(self, test_data: dict) -> Tuple[Any, Any]: # --- A1. tl bounds explicitly defined for a particular feature/profile combo self._set_traffic_lights( - feature, explicit_cols, pattern="*", rule_name="name" + feature, + explicit_cols, + pattern="*", + rule=lambda name, _, __: name, ) # --- B1. tl bounds implicitly defined for particular feature @@ -230,13 +231,16 @@ def transform(self, test_data: dict) -> Tuple[Any, Any]: feature, test_df.columns, pattern, - rule_name="feature + ':' + pattern", + rule=lambda _, feat, pat: f"{feat}:{pat}", ) # --- 2. tl bounds not explicitly defined for a particular feature, # see if a wildcard match can be found. 
for pattern in nkeys: self._set_traffic_lights( - feature, test_df.columns, pattern, rule_name="pattern" + feature, + test_df.columns, + pattern, + rule=lambda _, __, pat: pat, ) return self.traffic_lights, self.traffic_light_funcs @@ -283,7 +287,7 @@ def pull_bounds( required = [m + suffix_mean, m + suffix_std] assert all(r in row for r in required) - x = pd.Series() + x = {} for m in cols: x[m + "_red_high"] = np.nan x[m + "_yellow_high"] = np.nan @@ -296,7 +300,7 @@ def pull_bounds( x[m + "_yellow_high"] = row[m + suffix_mean] + row[m + suffix_std] * yellow_high x[m + "_yellow_low"] = row[m + suffix_mean] + row[m + suffix_std] * yellow_low x[m + "_red_low"] = row[m + suffix_mean] + row[m + suffix_std] * red_low - return x + return pd.Series(x) def df_single_op_pull_bounds( diff --git a/popmon/analysis/__init__.py b/popmon/analysis/__init__.py index a45ccb40..efe44800 100644 --- a/popmon/analysis/__init__.py +++ b/popmon/analysis/__init__.py @@ -19,5 +19,7 @@ from ..analysis.apply_func import ApplyFunc +from .comparison import Comparisons +from .profiling import Profiles -__all__ = ["ApplyFunc"] +__all__ = ["ApplyFunc", "Comparisons", "Profiles"] diff --git a/popmon/analysis/comparison/__init__.py b/popmon/analysis/comparison/__init__.py index 906f5b02..388ec3ef 100644 --- a/popmon/analysis/comparison/__init__.py +++ b/popmon/analysis/comparison/__init__.py @@ -16,8 +16,6 @@ # COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER # IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- - from ...analysis.comparison.hist_comparer import ( ExpandingHistComparer, ExpandingNormHistComparer, @@ -26,8 +24,10 @@ RollingHistComparer, RollingNormHistComparer, ) +from .comparisons import Comparisons __all__ = [ + "Comparisons", "ReferenceHistComparer", "RollingHistComparer", "ExpandingHistComparer", diff --git a/popmon/analysis/comparison/comparisons.py b/popmon/analysis/comparison/comparisons.py index eb88f2a0..510705d2 100644 --- a/popmon/analysis/comparison/comparisons.py +++ b/popmon/analysis/comparison/comparisons.py @@ -16,26 +16,264 @@ # COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER # IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -from typing import Callable -class Comparisons: - _comparison_descriptions = {} - _comparison_funcs = {} +import numpy as np +from scipy import stats - @classmethod - def register(cls, key: str, description: str): - def f(func: Callable): - cls._comparison_descriptions[key] = description - cls._comparison_funcs[key] = func - return func +from popmon.base.registry import Registry - return f +Comparisons = Registry() - @classmethod - def get_comparisons(cls): - return cls._comparison_funcs - @classmethod - def get_descriptions(cls): - return cls._comparison_descriptions +@Comparisons.register( + key="max_prob_diff", + description="The largest absolute difference between all bin pairs of two normalized histograms (one histogram in a time slot and one in {ref})", + htype="all", +) +def googl_test(bins_1, bins_2): + """Google-paper test + + Reference link: https://mlsys.org/Conferences/2019/doc/2019/167.pdf + + :param bins_1: first array of bin entries + :param bins_2: second array of entries + + :return: maximum difference between the two entry distributions + :rtype: float + """ + + def dist(bins): + sum_ = np.sum(bins) + return bins / sum_ if sum_ else bins + + return 
np.max(np.abs(dist(bins_1) - dist(bins_2))) + + +@Comparisons.register(key="psi", description="Population Stability Index", htype="all") +def population_stability_index(po, qo): + epsilon = 10e-6 + p = po.copy() + q = qo.copy() + p += epsilon + q += epsilon + return np.sum((p - q) * np.log(p / q)) + + +def kullback_leibler_divergence(po, qo): + epsilon = 10e-6 + p = po.copy() + q = qo.copy() + p += epsilon + q += epsilon + return np.sum(p * np.log(p / q)) + + +@Comparisons.register(key="jsd", description="Jensen-Shannon Divergence", htype="all") +def jensen_shannon_divergence(p, q): + m = 0.5 * (p + q) + return 0.5 * (kullback_leibler_divergence(p, m) + kullback_leibler_divergence(q, m)) + + +def ks_test(hist_1, hist_2): + """KS-test for two histograms with different number of entries + + Copyright ROOT: + Formulas translated from c++ to python, but formulas otherwise not modified. + Reference: link: https://root.cern.ch/doc/master/classTH1.html#TH1:KolmogorovTest + GNU license: https://root.cern.ch/license + All modifications copyright ING WBAA. + + :param hist_1: 1D array with bin counts of the histogram_1 + :param hist_2: 1D array with bin counts of the histogram_2 + + :return: ks_score: Kolmogorov-Smirnov Test score + :rtype: float + """ + if len(hist_1) == 0 or len(hist_2) == 0: + raise ValueError("Input histogram(s) has zero size.") + if len(hist_1) != len(hist_2): + raise ValueError("Input histograms have unequal size.") + + sum_1 = np.sum(hist_1) + sum_2 = np.sum(hist_2) + if sum_1 == 0 or sum_2 == 0: + return np.nan + + normalized_cumsum_1 = np.cumsum(hist_1) / sum_1 + normalized_cumsum_2 = np.cumsum(hist_2) / sum_2 + + d = np.abs(normalized_cumsum_1 - normalized_cumsum_2) + + return np.max(d) * np.sqrt(sum_1 * sum_2 / (sum_1 + sum_2)) + + +def ks_prob(testscore): + """KS-probability corresponding to KS test score + + Copyright ROOT: + Formulas translated from c++ to python, but formulas otherwise not modified.
+ Reference: https://root.cern.ch/doc/master/classTH1.html#TH1:KolmogorovTest + GNU license: https://root.cern.ch/license + All modifications copyright ING WBAA. + + :param float testscore: Kolmogorov-Smirnov test score + + :return: approximate pvalue for the Kolmogorov-Smirnov test score + :rtype: float + """ + fj = np.array([-2, -8, -18, -32]) + r = np.zeros(4) + + w = 2.50662827 + c = np.array([-1.2337005501361697, -11.103304951225528, -30.842513753404244]) + + u = abs(testscore) + pvalue = np.nan + if u < 0.2: + pvalue = 1 + elif u < 0.755: + v = np.power(u, -2) + pvalue = 1 - w * np.exp(c * v).sum() / u + elif u < 6.8116: + v = np.power(u, 2) + max_j = int(max(1, round(3.0 / u))) + r[:max_j] = np.exp(fj[:max_j] * v) + pvalue = 2 * (r[0] - r[1] + r[2] - r[3]) + + return pvalue + + +@Comparisons.register( + key=["ks", "ks_pvalue", "ks_zscore"], + description=[ + "Kolmogorov-Smirnov test statistic comparing each time slot to {ref}", + "p-value of the Kolmogorov-Smirnov test, comparing each time slot with {ref}", + "Z-score of the Kolmogorov-Smirnov test, comparing each time slot with {ref}", + ], + dim=1, + htype="num", +) +def ks(p, q, *args): + # KS-test only properly defined for (ordered) 1D interval variables + ks_testscore = ks_test(p, q) + ks_pvalue = ks_prob(ks_testscore) + ks_zscore = -stats.norm.ppf(ks_pvalue) + return ks_testscore, ks_pvalue, ks_zscore + + +@Comparisons.register( + key="unknown_labels", + description="Are categories observed in a given time slot that are not present in {ref}?", + dim=1, + htype="cat", +) +def unknown_labels(hist1, hist2): + # check consistency of bin_labels + labels1 = hist1.keySet + labels2 = hist2.keySet + subset = labels1 <= labels2 + return int(not subset) + + +@Comparisons.register( + key="pearson", + description="Pearson correlation between each time slot and {ref}", + dim=(2,), + htype="all", +) +def pearson(p, q, *args): + # calculate pearson coefficient + pearson_coeff = np.nan + if len(p) >= 2: + same0 = all(p 
== p[0]) + same1 = all(q == q[0]) + if not same0 and not same1: + # this avoids std==0, and thereby avoid runtime warnings + pearson_coeff, _ = stats.pearsonr(p, q) + return pearson_coeff + + +def uu_chi2(n, m): + """Normalized Chi^2 formula for two histograms with different number of entries + + Copyright ROOT: + Formulas translated from c++ to python, but formulas otherwise not modified. + Reference: https://root.cern.ch/doc/master/classTH1.html#a6c281eebc0c0a848e7a0d620425090a5 + GNU License: https://root.cern.ch/license + All modifications copyright ING WBAA. + + :param n: 1d array with bin counts of the reference set + :param m: 1d array with bin counts of the test set + :return: tuple of floats (chi2_value, chi2_norm, z_score, p_value, res) + """ + + def _not_finite_to_zero(x): + res = x.copy() + res[~np.isfinite(res)] = 0 + return res + + if len(n) == 0 or len(m) == 0: + raise ValueError("Input histogram(s) has zero size.") + if len(n) != len(m): + raise ValueError("Input histograms have unequal size.") + + N = np.sum(n) + M = np.sum(m) + + if N == 0 or M == 0: + return np.nan, np.nan, np.nan, np.nan, [0] * len(n) + + # remove all zero entries in the sum, to prevent division by zero for individual bins + z = n + m + n = n[z != 0] + m = m[z != 0] + + dof = ((n != 0) | (m != 0)).sum() - 1 + chi2_value = _not_finite_to_zero(((M * n - N * m) ** 2) / (n + m)).sum() / M / N + + chi2_norm = chi2_value / dof if dof > 0 else np.nan + p_value = stats.chi2.sf(chi2_value, dof) + z_score = -stats.norm.ppf(p_value) + + p = (n + m) / (N + M) + + if (p == 1).any(): + # unusual case of (only) one bin with p==1, avoids division by zero below + res = np.array([np.nan] * len(p)) + else: + res = _not_finite_to_zero( + (n - N * p) / np.sqrt(N * p) / np.sqrt((1 - N / (N + M)) * (1 - p)) + ) + + return chi2_value, chi2_norm, z_score, p_value, res + + +@Comparisons.register( + key=[ + "chi2", + "chi2_norm", + "chi2_zscore", + "chi2_pvalue", + "chi2_max_residual", +
"chi2_spike_count", + ], + description=[ + "Chi-squared test statistic, comparing each time slot with {ref}", + "Normalized chi-squared statistic, comparing each time slot with {ref}", + "Z-score of the chi-squared statistic, comparing each time slot with {ref}", + "p-value of the chi-squared statistic, comparing each time slot with {ref}", + "The largest absolute normalized residual (|chi|) observed in all bin pairs " + + "(one histogram in a time slot and one in {ref})", + "The number of normalized residuals of all bin pairs (one histogram in a time" + + " slot and one in {ref}) with absolute value bigger than a given threshold (default: 7).", + ], + htype="all", +) +def chi2(*args, max_res_bound=7.0): + chi2r, chi2_norm, zscore, pvalue, res = uu_chi2(*args) + abs_residual = np.abs(res) + chi2_max_residual = np.max(abs_residual) + chi2_spike_count = np.sum(abs_residual[abs_residual > max_res_bound]) + + return chi2r, chi2_norm, zscore, pvalue, chi2_max_residual, chi2_spike_count diff --git a/popmon/analysis/comparison/hist_comparer.py b/popmon/analysis/comparison/hist_comparer.py index 63f22dd7..9dc25b31 100644 --- a/popmon/analysis/comparison/hist_comparer.py +++ b/popmon/analysis/comparison/hist_comparer.py @@ -20,7 +20,6 @@ import numpy as np import pandas as pd -from scipy.stats import norm, pearsonr from ...analysis.apply_func import ApplyFunc from ...analysis.functions import ( @@ -39,10 +38,9 @@ ) from ...base import Pipeline from ...hist.hist_utils import COMMON_HIST_TYPES, is_numeric -from ...stats.numpy import ks_prob, ks_test, uu_chi2 -def hist_compare(row, hist_name1="", hist_name2="", max_res_bound=7.0): +def hist_compare(row, hist_name1="", hist_name2=""): """Function to compare two histograms Apply statistical tests to compare two input histograms, such as: @@ -52,28 +50,11 @@ def hist_compare(row, hist_name1="", hist_name2="", max_res_bound=7.0): :param pd.Series row: row to apply compare function to :param str hist_name1: name of histogram one to 
compare :param str hist_name2: name of histogram two to compare - :param float max_res_bound: count number of normalized residuals with (absolute) value greater than X. - Default is 7.0. :return: pandas Series with popular comparison metrics. """ - from .comparisons import Comparisons - - x = { - "ks": np.nan, - "ks_zscore": np.nan, - "ks_pvalue": np.nan, - "pearson": np.nan, - "chi2": np.nan, - "chi2_norm": np.nan, - "chi2_zscore": np.nan, - "chi2_pvalue": np.nan, - "chi2_max_residual": np.nan, - "chi2_spike_count": np.nan, - "unknown_labels": np.nan, - } - - for key in Comparisons.get_comparisons().keys(): - x[key] = np.nan + from popmon.analysis.comparison import Comparisons + + x = {key: np.nan for key in Comparisons.get_keys()} # basic name checks cols = row.index.to_list() @@ -93,48 +74,27 @@ def hist_compare(row, hist_name1="", hist_name2="", max_res_bound=7.0): # compare if hist1.n_dim == 1: + entries_list = get_consistent_numpy_entries([hist1, hist2]) if is_numeric(hist1): - # KS-test only properly defined for (ordered) 1D interval variables - entries_list = get_consistent_numpy_entries([hist1, hist2]) - ks_testscore = ks_test(*entries_list) - x["ks"] = ks_testscore - ks_pvalue = ks_prob(ks_testscore) - x["ks_pvalue"] = ks_pvalue - x["ks_zscore"] = -norm.ppf(ks_pvalue) - else: # categorical - entries_list = get_consistent_numpy_entries([hist1, hist2]) - # check consistency of bin_labels - labels1 = hist1.keySet - labels2 = hist2.keySet - subset = labels1 <= labels2 - x["unknown_labels"] = int(not subset) + htype = "num" + args = entries_list + else: + htype = "cat" + args = [hist1, hist2] + + x.update(Comparisons.run(args, dim=1, htype=htype)) + x.update(Comparisons.run(entries_list, dim=1, htype="all")) else: numpy_ndgrids = get_consistent_numpy_ndgrids([hist1, hist2], dim=hist1.n_dim) entries_list = [entry.flatten() for entry in numpy_ndgrids] - # calculate pearson coefficient - pearson, pvalue = (np.nan, np.nan) - if len(entries_list[0]) >= 2: - same0 = 
all(entries_list[0] == entries_list[0][0]) - same1 = all(entries_list[1] == entries_list[1][0]) - if not same0 and not same1: - # this avoids std==0, and thereby avoid runtime warnings - pearson, pvalue = pearsonr(*entries_list) - - chi2, chi2_norm, zscore, pvalue, res = uu_chi2(*entries_list) - abs_residual = np.abs(res) - chi2_max_residual = np.max(abs_residual) - chi2_spike_count = np.sum(abs_residual[abs_residual > max_res_bound]) - - x["pearson"] = pearson - x["chi2"] = chi2 - x["chi2_norm"] = chi2_norm - x["chi2_zscore"] = zscore - x["chi2_pvalue"] = pvalue - x["chi2_max_residual"] = chi2_max_residual - x["chi2_spike_count"] = chi2_spike_count - for key, func in Comparisons.get_comparisons().items(): - x[key] = func(*entries_list) + x.update(Comparisons.run(entries_list, dim=(2,), htype="all")) + + x.update(Comparisons.run(entries_list, dim=-1, htype="all")) + + if len(set(x.keys()) - set(Comparisons.get_keys())) > 0: + raise ValueError("Could not compute full comparison") + return pd.Series(x) @@ -149,7 +109,6 @@ def __init__( assign_to_key=None, hist_col="histogram", suffix="comp", - max_res_bound=7.0, *args, **kwargs, ): @@ -161,8 +120,6 @@ def __init__( :param str assign_to_key: key of the input data to assign function applied-output to. (optional) :param str hist_col: column/key in input df/dict that contains the histogram. default is 'histogram' :param str suffix: column/key of rolling histogram. default is 'roll' -> column = 'histogram_roll' - :param float max_res_bound: count number of normalized residuals with (absolute) value greater than X. - Default is 7.0. 
:param args: (tuple, optional): residual args passed on to func_mean and func_std :param kwargs: (dict, optional): residual kwargs passed on to func_mean and func_std """ @@ -188,7 +145,6 @@ def __init__( "hist_name2": hist_col + "_" + suffix, "prefix": suffix, "axis": 1, - "max_res_bound": max_res_bound, } ], ) @@ -207,7 +163,6 @@ def __init__( shift=1, hist_col="histogram", suffix="roll", - max_res_bound=7.0, ): """Initialize an instance of RollingHistComparer. @@ -217,8 +172,6 @@ def __init__( :param int shift: shift of rolling window. default is 1. :param str hist_col: column/key in input df/dict that contains the histogram. default is 'histogram' :param str suffix: column/key of rolling histogram. default is 'roll' -> column = 'histogram_roll' - :param float max_res_bound: count number of normalized residuals with (absolute) value greater than X. - Default is 7.0. """ super().__init__( rolling_hist, @@ -227,7 +180,6 @@ def __init__( read_key, hist_col, suffix, - max_res_bound, window=window, shift=shift, hist_name=hist_col, @@ -251,7 +203,6 @@ def __init__( store_key, hist_col="histogram", suffix="prev1", - max_res_bound=7.0, ): """Initialize an instance of PreviousHistComparer. @@ -259,8 +210,6 @@ def __init__( :param str store_key: key of output data to store in data store :param str hist_col: column/key in input df/dict that contains the histogram. default is 'histogram' :param str suffix: column/key of rolling histogram. default is 'prev' -> column = 'histogram_prev' - :param float max_res_bound: count number of normalized residuals with (absolute) value greater than X. - Default is 7.0. """ super().__init__( read_key, @@ -269,7 +218,6 @@ def __init__( shift=1, hist_col=hist_col, suffix=suffix, - max_res_bound=max_res_bound, ) @@ -283,7 +231,6 @@ def __init__( shift=1, hist_col="histogram", suffix="expanding", - max_res_bound=7.0, ): """Initialize an instance of ExpandingHistComparer. 
@@ -292,8 +239,6 @@ def __init__( :param int shift: shift of rolling window. default is 1. :param str hist_col: column/key in input df/dict that contains the histogram. default is 'histogram' :param str suffix: column/key of rolling histogram. default is 'expanding' -> column = 'histogram_expanding' - :param float max_res_bound: count number of normalized residuals with (absolute) value greater than X. - Default is 7.0. """ super().__init__( expanding_hist, @@ -302,7 +247,6 @@ def __init__( read_key, hist_col, suffix, - max_res_bound, shift=shift, hist_name=hist_col, ) @@ -325,7 +269,6 @@ def __init__( store_key, hist_col="histogram", suffix="ref", - max_res_bound=7.0, ): """Initialize an instance of ReferenceHistComparer. @@ -334,8 +277,6 @@ def __init__( :param str store_key: key of output data to store in data store :param str hist_col: column/key in input df/dict that contains the histogram. default is 'histogram' :param str suffix: column/key of rolling histogram. default is 'ref' -> column = 'histogram_ref' - :param float max_res_bound: count number of normalized residuals with (absolute) value greater than X. - Default is 7.0. 
""" super().__init__( hist_sum, @@ -344,7 +285,6 @@ def __init__( assign_to_key, hist_col, suffix, - max_res_bound, metrics=[hist_col], ) self.reference_key = reference_key diff --git a/popmon/analysis/functions.py b/popmon/analysis/functions.py index 6ee92ba7..d5506db8 100644 --- a/popmon/analysis/functions.py +++ b/popmon/analysis/functions.py @@ -421,12 +421,12 @@ def normalized_hist_mean_cov(x, hist_name=""): entries_list, binning = get_consistent_numpy_entries( hist_list, get_bin_labels=True ) - entries_list = np.array(entries_list, dtype=np.float) + entries_list = np.array(entries_list, dtype=float) else: entries_list, keys = get_consistent_numpy_2dgrids( hist_list, get_bin_labels=True ) - entries_list = np.array([h.flatten() for h in entries_list], dtype=np.float) + entries_list = np.array([h.flatten() for h in entries_list], dtype=float) binning = tuple(keys) # calculation of mean normalized histogram and its covariance matrix @@ -514,11 +514,11 @@ def relative_chi_squared( (norm_mean - single_norm), np.dot(pm, (norm_mean - single_norm)) ) if chi_squared <= 0: - chi_squared = np.finfo(np.float).eps + chi_squared = np.finfo(float).eps else: # If a covariance matrix is singular we fall back on using variances chi_squared = np.sum( - (norm_mean - single_norm) ** 2 / (variance + np.finfo(np.float).eps) + (norm_mean - single_norm) ** 2 / (variance + np.finfo(float).eps) ) # pvalue and zvalue based on naive number of degrees of freedom @@ -534,7 +534,7 @@ def relative_chi_squared( z_score = np.sqrt(u - np.log(u)) max_resid = np.max( - np.abs((norm_mean - single_norm) / np.sqrt(variance + np.finfo(np.float).eps)) + np.abs((norm_mean - single_norm) / np.sqrt(variance + np.finfo(float).eps)) ) x["chi2"] = chi_squared diff --git a/popmon/analysis/profiling/__init__.py b/popmon/analysis/profiling/__init__.py index df5229fa..5229425e 100644 --- a/popmon/analysis/profiling/__init__.py +++ b/popmon/analysis/profiling/__init__.py @@ -16,16 +16,16 @@ # COPYRIGHT HOLDERS BE 
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER # IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - from ...analysis.profiling.hist_profiler import HistProfiler from ...analysis.profiling.pull_calculator import ( ExpandingPullCalculator, ReferencePullCalculator, RollingPullCalculator, ) +from .profiles import Profiles __all__ = [ + "Profiles", "HistProfiler", "RollingPullCalculator", "ReferencePullCalculator", diff --git a/popmon/analysis/profiling/hist_profiler.py b/popmon/analysis/profiling/hist_profiler.py index 4d5cfb03..9db85b43 100644 --- a/popmon/analysis/profiling/hist_profiler.py +++ b/popmon/analysis/profiling/hist_profiler.py @@ -21,137 +21,8 @@ import numpy as np import pandas as pd -from popmon.analysis.profiling.profiles import Profiles -from popmon.stats import numpy as pm_np - -from ...analysis.hist_numpy import get_2dgrid from ...base import Module -from ...hist.hist_utils import get_bin_centers, is_numeric, is_timestamp, sum_entries - - -@Profiles.register( - key=["min", "max", "p01", "p05", "p16", "p50", "p84", "p95", "p99"], - description=[ - "Minimum value", - "Maximum value", - "1% percentile", - "5% percentile", - "16% percentile", - "50% percentile (median)", - "84% percentile", - "95% percentile", - "99% percentile", - ], - dim=1, - htype="num", -) -def profile_quantiles(x, w): - return pm_np.quantile( - x, q=[0.0, 1.0, 0.01, 0.05, 0.16, 0.50, 0.84, 0.95, 0.99], weights=w - ) - - -@Profiles.register(key="mean", description="Mean value", dim=1, htype="num") -def profile_mean(x, w): - return pm_np.mean(x, w) - - -@Profiles.register(key="std", description="Standard deviation", dim=1, htype="num") -def profile_std(x, w): - return pm_np.std(x, w) - - -@Profiles.register(key="nan", description="Number of missing entries (NaN)", dim=1) -def profile_nan(hist): - if hasattr(hist, "nanflow"): - return hist.nanflow.entries - elif 
hasattr(hist, "bins") and "NaN" in hist.bins: - return hist.bins["NaN"].entries - return 0 - - -@Profiles.register( - key="overflow", - description="Number of values larger than the maximum bin-edge of the histogram.", - dim=1, -) -def profile_overflow(hist): - if hasattr(hist, "overflow"): - return hist.overflow.entries - return 0 - - -@Profiles.register( - key="underflow", - description="Number of values smaller than the minimum bin-edge of the histogram.", - dim=1, -) -def profile_underflow(hist): - if hasattr(hist, "underflow"): - return hist.underflow.entries - return 0 - - -@Profiles.register( - key="phik", - description="phi-k correlation between the two variables of the histogram", - dim=2, -) -def profile_phik(hist): - from phik import phik - - # calculate phik correlation - try: - grid = get_2dgrid(hist) - except Exception: - raise - - try: - phi_k = phik.phik_from_hist2d(observed=grid) - except ValueError: - # self.logger.debug( - # f"Not enough values in the 2d `{name}` time-split histogram to apply the phik test." 
- # ) - phi_k = np.nan - return phi_k - - -@Profiles.register( - key="count", description="Number of entries (non-NaN and NaN)", dim=None -) -def profile_count(hist): - return int(sum_entries(hist)) - - -@Profiles.register( - key="filled", - description="Number of non-missing entries (non-NaN)", - dim=1, - htype="all", -) -def profile_filled(_, bin_counts): - return bin_counts.sum() - - -@Profiles.register( - key="distinct", description="Number of distinct entries", dim=1, htype="all" -) -def profile_distinct(bin_labels, bin_counts): - return len(np.unique(bin_labels[bin_counts > 0])) - - -@Profiles.register( - key="fraction_of_true", description="", dim=1, htype="cat" -) # or type="bool" -def profile_fraction_of_true(bin_labels, bin_counts): - return pm_np.fraction_of_true(bin_labels, bin_counts) - - -@Profiles.register( - key="most_probable_value", description="Most probable value", dim=1, htype="all" -) -def profile_most_probable_value(bin_labels, bin_counts): - return bin_labels[np.argmax(bin_counts)] +from ...hist.hist_utils import get_bin_centers, is_numeric, is_timestamp class HistProfiler(Module): @@ -203,6 +74,8 @@ def __init__( raise NotImplementedError() def _profile_1d_histogram(self, name, hist): + from popmon.analysis import Profiles + # preprocessing value counts and TS is_num = is_numeric(hist) is_ts = is_timestamp(hist) or name in self.var_timestamp @@ -222,23 +95,13 @@ def _profile_1d_histogram(self, name, hist): # calc 1d-histogram statistics profile = {} - for (key, htype), func in Profiles.get_profiles(dim=1).items(): - if htype is not None and htype != otype and htype != "all": - # skipping; type not applicable - continue - - if htype is None: - args = [hist] - else: - args = [bin_labels, bin_counts] + args = [bin_labels, bin_counts] - results = func(*args) + profile.update(Profiles.run(args, dim=1, htype=otype)) + profile.update(Profiles.run(args, dim=1, htype="all")) - if isinstance(key, (list, tuple)): - for k, v in zip(key, results): - 
profile[k] = v - else: - profile[key] = results + # difference between htype=None and htype="all" are arguments (bin labels vs hist) + profile.update(Profiles.run([hist], dim=1, htype=None)) # postprocessing TS if is_ts: @@ -253,6 +116,8 @@ def _profile_1d_histogram(self, name, hist): return profile def _profile_nd_histogram(self, name, hist, dim): + from popmon.analysis import Profiles + if hist.n_dim < dim: self.logger.warning( f"Histogram {name} has {hist.n_dim} dimensions (<{dim}); cannot profile. Returning empty." @@ -260,22 +125,17 @@ def _profile_nd_histogram(self, name, hist, dim): return {} # calc nd-histogram statistics - profile = {} - for (key, htype), func in Profiles.get_profiles(dim).items(): - if htype is None: - result = func(hist) - else: - raise NotImplementedError("histogram types for nD not implemented") - - if isinstance(key, (list, tuple)): - for k, v in zip(key, result): - profile[k] = v - else: - profile[key] = result + profile = Profiles.run([hist], dim=dim, htype=None) + profile.update(Profiles.run([hist], dim=dim, htype="all")) + profile.update(Profiles.run([hist], dim=dim, htype="num")) + profile.update(Profiles.run([hist], dim=dim, htype="cat")) + profile.update(Profiles.run([hist], dim=-1, htype=None)) return profile def _profile_hist(self, split, hist_name): + from popmon.analysis.profiling import Profiles + if len(split) == 0: self.logger.error(f'Split histograms dict "{hist_name}" is empty. 
Return.') return [] @@ -286,10 +146,20 @@ def _profile_hist(self, split, hist_name): htype = "num" if is_num else "cat" # these are the profiled quantities we will monitor - if dimension == 1: - expected_fields = Profiles.get_profile_keys(dim=1, htype=htype) - else: - expected_fields = Profiles.get_profile_keys(dim=dimension) + expected_fields = ( + Profiles.get_keys_by_dim_and_htype(dim=dimension, htype=htype) + + Profiles.get_keys_by_dim_and_htype(dim=dimension, htype="all") + + Profiles.get_keys_by_dim_and_htype(dim=dimension, htype=None) + ) + + # profiles regardless of dim and htype (e.g. count) + expected_fields += Profiles.get_keys_by_dim_and_htype(dim=None, htype=None) + + # profiles regardless of dim + expected_fields += Profiles.get_keys_by_dim_and_htype(dim=-1, htype=htype) + expected_fields += Profiles.get_keys_by_dim_and_htype(dim=-1, htype="all") + expected_fields += Profiles.get_keys_by_dim_and_htype(dim=-1, htype=None) + expected_fields += [self.index_col, self.hist_col] # now loop over split-axis, e.g. time index, and profile each sub-hist x:y @@ -308,7 +178,8 @@ def _profile_hist(self, split, hist_name): if sorted(profile.keys()) != sorted(expected_fields): self.logger.error( - f'Could not extract full profile for sub-hist "{hist_name} {index}". Skipping.' + f'Could not extract full profile for sub-hist "{hist_name} {index}".' + f" Differences: {set(profile.keys()).symmetric_difference(set(expected_fields))}. Skipping." ) else: profile_list.append(profile) diff --git a/popmon/analysis/profiling/profiles.py b/popmon/analysis/profiling/profiles.py index c72688fa..bc4d48da 100644 --- a/popmon/analysis/profiling/profiles.py +++ b/popmon/analysis/profiling/profiles.py @@ -16,86 +16,195 @@ # COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER # IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-from copy import copy -from typing import Callable, List, Optional, Tuple, Union - - -class Profiles: - _profile_descriptions = {} - _profile_funcs = {-1: {}} - - @classmethod - def register( - cls, - key: Union[str, List[str], Tuple[str]], - description: Union[str, List[str], Tuple[str]], - dim: Optional[int] = None, - htype: Optional[str] = None, - ): - if dim is None: - dim = -1 - if isinstance(key, list): - key = tuple(key) - - if isinstance(description, list): - description = tuple(description) - - def f(func: Callable): - if isinstance(key, tuple): - for k, d in zip(key, description): - cls._profile_descriptions[k] = d - else: - cls._profile_descriptions[key] = description - - if dim not in cls._profile_funcs: - cls._profile_funcs[dim] = {} - cls._profile_funcs[dim][(key, htype)] = func - return func - - return f - - @classmethod - def get_profiles( - cls, - dim: Optional[int] = None, - htype: Optional[str] = None, - ): - def merge(d1, d2): - x = copy(d1) - x.update(d2) - return x - - if dim is None: - v = cls._profile_funcs[-1] - else: - v = merge(cls._profile_funcs.get(dim, {}), cls._profile_funcs[-1]) - - return v - - @classmethod - def get_profile_keys( - cls, - dim: Optional[int] = None, - htype: Optional[str] = None, - ): - def flatten(input_list): - vals = [] - for v in input_list: - if isinstance(v, (list, tuple)): - for v2 in v: - vals.append(v2) - else: - vals.append(v) - return vals - - return flatten( - [ - k - for (k, dtype), v in cls.get_profiles(dim).items() - if dtype is None or htype is None or dtype == "all" or htype == dtype - ] + + +import numpy as np + +from popmon.base.registry import Registry + +from ...analysis.hist_numpy import get_2dgrid +from ...hist.hist_utils import sum_entries +from ...stats import numpy as pm_np + +Profiles = Registry() + + +@Profiles.register( + key=["min", "max", "p01", "p05", "p16", "p50", "p84", "p95", "p99"], + description=[ + "Minimum value", + "Maximum value", + "1% percentile", + "5% percentile", + "16% 
percentile", + "50% percentile (median)", + "84% percentile", + "95% percentile", + "99% percentile", + ], + dim=1, + htype="num", +) +def profile_quantiles(x, w): + return tuple( + pm_np.quantile( + x, q=[0.0, 1.0, 0.01, 0.05, 0.16, 0.50, 0.84, 0.95, 0.99], weights=w ) + ) + + +@Profiles.register(key="mean", description="Mean value", dim=1, htype="num") +def profile_mean(x, w): + return pm_np.mean(x, w) + + +@Profiles.register(key="std", description="Standard deviation", dim=1, htype="num") +def profile_std(x, w): + return pm_np.std(x, w) + + +@Profiles.register( + key="nan", description="Number of missing entries (NaN)", dim=1, htype=None +) +def profile_nan(hist): + if hasattr(hist, "nanflow"): + return hist.nanflow.entries + elif hasattr(hist, "bins") and "NaN" in hist.bins: + return hist.bins["NaN"].entries + return 0 + + +@Profiles.register( + key="overflow", + description="Number of values larger than the maximum bin-edge of the histogram.", + dim=1, + htype=None, +) +def profile_overflow(hist): + if hasattr(hist, "overflow"): + return hist.overflow.entries + return 0 + + +@Profiles.register( + key="underflow", + description="Number of values smaller than the minimum bin-edge of the histogram.", + dim=1, + htype=None, +) +def profile_underflow(hist): + if hasattr(hist, "underflow"): + return hist.underflow.entries + return 0 + + +@Profiles.register( + key="phik", + description="phi-k correlation between the two variables of the histogram", + dim=2, + htype=None, +) +def profile_phik(hist): + from phik import phik + + # calculate phik correlation + try: + grid = get_2dgrid(hist) + except Exception: + raise + + try: + phi_k = phik.phik_from_hist2d(observed=grid) + except ValueError: + # self.logger.debug( + # f"Not enough values in the 2d `{name}` time-split histogram to apply the phik test." 
+ # ) + phi_k = np.nan + return phi_k + + +@Profiles.register( + key="count", description="Number of entries (non-NaN and NaN)", dim=-1, htype=None +) +def profile_count(hist): + return int(sum_entries(hist)) + + +@Profiles.register( + key="filled", + description="Number of non-missing entries (non-NaN)", + dim=1, + htype="all", +) +def profile_filled(_, bin_counts): + return bin_counts.sum() + + +@Profiles.register( + key="distinct", description="Number of distinct entries", dim=1, htype="all" +) +def profile_distinct(bin_labels, bin_counts): + return len(np.unique(bin_labels[bin_counts > 0])) + + +@Profiles.register(key="fraction_of_true", description="", dim=1, htype="cat") +def profile_fraction_of_true(bin_labels, bin_counts): + """Compute fraction of 'true' labels + + :param bin_labels: Array containing numbers whose mean is desired. If `a` is not an + array, a conversion is attempted. + :param bin_entries: Array containing weights for the elements of `a`. If `weights` is not an + array, a conversion is attempted. 
+ :return: fraction of 'true' labels + """ + bin_labels = np.array(bin_labels) + bin_entries = np.array(bin_counts) + assert len(bin_labels) == len(bin_entries) + + def replace(bl): + if bl in {"True", "true"}: + return True + elif bl in {"False", "false"}: + return False + return np.nan + + # basic checks: dealing with boolean labels + # also accept strings of 'True' and 'False' + if len(bin_labels) == 0 or len(bin_labels) > 4 or np.sum(bin_entries) == 0: + return np.nan + if not np.all([isinstance(bl, (bool, np.bool_)) for bl in bin_labels]): + if not np.all( + [isinstance(bl, (str, np.str_, np.bytes_)) for bl in bin_labels] + ): + return np.nan + # all strings from hereon + n_true = (bin_labels == "True").sum() + (bin_labels == "true").sum() + n_false = (bin_labels == "False").sum() + (bin_labels == "false").sum() + n_nan = ( + (bin_labels == "NaN").sum() + + (bin_labels == "nan").sum() + + (bin_labels == "None").sum() + + (bin_labels == "none").sum() + + (bin_labels == "Null").sum() + + (bin_labels == "null").sum() + ) + if n_true + n_false + n_nan != len(bin_labels): + return np.nan + # convert string to boolean + bin_labels = np.array([replace(bl) for bl in bin_labels]) + # NaN is truthy: compare explicitly so NaN labels are not counted as true + sum_true = np.sum([be for bl, be in zip(bin_labels, bin_entries) if bl == 1]) + sum_false = np.sum([be for bl, be in zip(bin_labels, bin_entries) if not bl]) + sum_entries = sum_true + sum_false + if sum_entries == 0: + # all nans scenario + return np.nan + # exclude nans from fraction + return (1.0 * sum_true) / sum_entries + - @classmethod - def get_descriptions(cls): - return cls._profile_descriptions +@Profiles.register( + key="most_probable_value", description="Most probable value", dim=1, htype="all" +) +def profile_most_probable_value(bin_labels, bin_counts): + return bin_labels[np.argmax(bin_counts)] diff --git a/popmon/base/module.py b/popmon/base/module.py index d46fa5e4..0f524a93 100644 --- a/popmon/base/module.py +++ b/popmon/base/module.py @@ -182,3 +182,11 @@ def transform(self,
*args): :rtype: dict """ raise NotImplementedError + + def __repr__(self): + """String representation for modules when printing a pipeline/list of modules""" + name = self.__class__.__name__ + input_keys = [f"{v}='{getattr(self, v)}'" for v in self._input_keys] + output_keys = [f"{v}='{getattr(self, v)}'" for v in self._output_keys] + params = ", ".join(input_keys + output_keys) + return f"{name}({params})" diff --git a/popmon/base/pipeline.py b/popmon/base/pipeline.py index 20c009e4..292f3ea8 100644 --- a/popmon/base/pipeline.py +++ b/popmon/base/pipeline.py @@ -68,3 +68,12 @@ def transform(self, datastore): self.logger.debug(f"transform {module.__class__.__name__}") datastore = module.transform(datastore) return datastore + + def __repr__(self): + """String representation for pipeline""" + name = self.__class__.__name__ + ret = f"{name}: [\n" + for m in self.modules: + ret += "\t" + str(m).replace("\n", "\n\t") + "\n" + ret += "]" + return ret diff --git a/popmon/base/registry.py b/popmon/base/registry.py new file mode 100644 index 00000000..1bc2b7db --- /dev/null +++ b/popmon/base/registry.py @@ -0,0 +1,127 @@ +# Copyright (c) 2022 ING Wholesale Banking Advanced Analytics +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +from collections import defaultdict +from typing import Callable, Dict, List, Optional, Tuple, Union + + +class Registry: + _properties = ("dim", "htype") + + def __init__(self): + self._keys: List[str] = [] + self._descriptions: Dict[str, str] = {} + self._properties_to_func = defaultdict(lambda: defaultdict(dict)) + self._func_name_to_properties = {} + + def register( + self, + key: Union[str, List[str], Tuple[str]], + description: Union[str, List[str], Tuple[str]], + dim: int = -1, + htype: Optional[str] = None, + ): + # rename for function use, without changing api + keys = key + del key + + descriptions = description + del description + + # ensure that keys are a tuple + if isinstance(keys, list): + keys = tuple(keys) + elif not isinstance(keys, tuple): + keys = (keys,) + + # ensure that description is a tuple + if isinstance(descriptions, list): + descriptions = tuple(descriptions) + elif not isinstance(descriptions, tuple): + descriptions = (descriptions,) + + def f(func: Callable): + # function names should be unique + if func.__name__ in self._func_name_to_properties: + raise ValueError( + f"A function with the name '{func.__name__}' has already been registered." 
+ ) + + # keys should unique correspond to a function + for key in keys: + if key in self._keys: + raise ValueError(f"Key '{key}' has already been registered.") + + # register properties + self._keys += list(keys) + self._func_name_to_properties[func.__name__] = (dim, htype, keys) + self._properties_to_func[dim][htype][keys] = func + self._descriptions.update(dict(zip(keys, descriptions))) + + return func + + return f + + # Methods + def _get_func_properties_by_name( + self, function_name: str + ) -> Tuple[int, str, Tuple[str]]: + return self._func_name_to_properties[function_name] + + def get_func_by_name(self, function_name: str) -> Callable: + """ + Get a function by the function name + + Parameters + ---------- + function_name: name of the original function + """ + dim, htype, key = self._get_func_properties_by_name(function_name) + return self._properties_to_func[dim][htype][key] + + def get_func_by_dim_and_htype(self, dim, htype) -> Dict[Tuple[str], Callable]: + return self._properties_to_func[dim][htype] + + def get_keys(self) -> List[str]: + """List of keys associated with registered functions""" + return self._keys + + def get_keys_by_dim_and_htype(self, dim, htype) -> List[str]: + """Flat list of keys for a provided dimension and histogram type""" + return [ + v for values in self._properties_to_func[dim][htype].keys() for v in values + ] + + def get_descriptions(self) -> Dict[str, str]: + """Dictionary of key->description associated with registered functions""" + return self._descriptions + + def update_func(self, name, func) -> None: + dim, htype, key = self._func_name_to_properties[name] + self._properties_to_func[dim][htype][key] = func + + def run(self, args, dim, htype): + output = {} + for key, func in self.get_func_by_dim_and_htype(dim=dim, htype=htype).items(): + results = func(*args) + if not isinstance(results, tuple): + results = (results,) + output.update(dict(zip(key, results))) + return output diff --git a/popmon/config.py 
b/popmon/config.py index b1cef3e2..ca8a3641 100644 --- a/popmon/config.py +++ b/popmon/config.py @@ -16,66 +16,153 @@ # COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER # IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +from pathlib import Path +from typing import Dict, List, Optional, Union -from popmon.analysis.comparison.comparisons import Comparisons -from popmon.analysis.profiling.profiles import Profiles - -profiles = Profiles.get_descriptions() - - -comparisons = { - "ks": "Kolmogorov-Smirnov test statistic comparing each time slot to {ref}", - "ks_zscore": "Z-score of the Kolmogorov-Smirnov test, comparing each time slot with {ref}", - "ks_pvalue": "p-value of the Kolmogorov-Smirnov test, comparing each time slot with {ref}", - "pearson": "Pearson correlation between each time slot and {ref}", - "chi2": "Chi-squared test statistic, comparing each time slot with {ref}", - "chi2_norm": "Normalized chi-squared statistic, comparing each time slot with {ref}", - "chi2_pvalue": "p-value of the chi-squared statistic, comparing each time slot with {ref}", - "chi2_zscore": "Z-score of the chi-squared statistic, comparing each time slot with {ref}", - "chi2_max_residual": "The largest absolute normalized residual (|chi|) observed in all bin pairs " - + "(one histogram in a time slot and one in {ref})", - "chi2_spike_count": "The number of normalized residuals of all bin pairs (one histogram in a time" - + " slot and one in {ref}) with absolute value bigger than a given threshold (default: 7).", - "unknown_labels": "Are categories observed in a given time slot that are not present in {ref}?", -} -comparisons.update(Comparisons.get_descriptions()) - -references = { - "ref": "the reference data", - "roll": "a rolling window", - "prev1": "the preceding time slot", - "expanding": "all preceding time slots", -} - -alerts = { - "n_green": "Total 
number of green traffic lights (observed for all statistics)", - "n_yellow": "Total number of yellow traffic lights (observed for all statistics)", - "n_red": "Total number of red traffic lights (observed for all statistics)", - "worst": "Worst traffic light (observed for all statistics)", -} - -section_descriptions = { - "profiles": """Basic statistics of the data (profiles) calculated for each time period (a period - is represented by one bin). The yellow and red lines represent the corresponding - traffic light bounds (default: 4 and 7 standard deviations with respect to the reference data).""", - "comparisons": "Statistical comparisons of each time period (one bin) to the reference data.", - "traffic_lights": "Traffic light calculation for different statistics (based on the calculated normalized residual, a.k.a. pull). Statistics for which all traffic lights are green are hidden from view by default.", - "alerts": "Alerts aggregated by all traffic lights for each feature.", - "histograms": "Histograms of the last few time slots (default: 2).", - "overview": "Alerts aggregated per feature", -} - -histograms = { - "heatmap": "The heatmap shows the frequency of each value over time. If a variable has a high number of distinct values" - "(i.e. has a high cardinality), then the most frequent values are displayed and the remaining are grouped as 'Others'. " - "The maximum number of values to should is configurable (default: 20).", - "heatmap_column_normalized": "The column-normalized heatmap allows for comparing of time bins when the counts in each bin vary.", - "heatmap_row_normalized": "The row-normalized heatmaps allows for monitoring one value over time.", -} - -config = { - "section_descriptions": section_descriptions, - "limited_stats": [ +from pydantic import BaseModel, BaseSettings + +# Global configuration for the joblib parallelization. 
Could be used to change the number of jobs, and/or change +# the backend from default (loky) to 'multiprocessing' or 'threading'. +# (see https://joblib.readthedocs.io/en/latest/generated/joblib.Parallel.html for details) +parallel_args = {"n_jobs": 1} + +# Usage of the `ing_matplotlib_theme` +themed = True + + +class SectionModel(BaseModel): + name: str + """Name of the section in the report""" + + description: str + """Description of the section in the report""" + + +class ProfilesSection(SectionModel): + name = "Profiles" + """Name of the profiles section in the report""" + + description = """Basic statistics of the data (profiles) calculated for each time period (a period + is represented by one bin). The yellow and red lines represent the corresponding + traffic light bounds (default: 4 and 7 standard deviations with respect to the reference data).""" + """Description of the profiles section in the report""" + + +class AlertSection(SectionModel): + name = "Alerts" + """Name of the alerts section in the report""" + + description = "Alerts aggregated by all traffic lights for each feature." + """Description of the alerts section in the report""" + + descriptions = { + "n_green": "Total number of green traffic lights (observed for all statistics)", + "n_yellow": "Total number of yellow traffic lights (observed for all statistics)", + "n_red": "Total number of red traffic lights (observed for all statistics)", + } + """Descriptions of the individual alerts""" + + +class HistogramSectionModel(SectionModel): + name = "Histograms" + """Name of the histograms section in the report""" + + description = "Histograms of the last few time slots (default: 2)."
+ """Description of the histograms section in the report""" + + hist_names: List[str] = [ + "heatmap", + "heatmap_column_normalized", + "heatmap_row_normalized", + ] + """Heatmaps of histograms to display in the report""" + + hist_names_formatted = { + "heatmap": "Heatmap", + "heatmap_column_normalized": "Column-Normalized Heatmap", + "heatmap_row_normalized": "Row-Normalized Heatmap", + } + """Pretty-print names for the heatmaps""" + + descriptions = { + "heatmap": "The heatmap shows the frequency of each value over time. If a variable has a high number of distinct values" + "(i.e. has a high cardinality), then the most frequent values are displayed and the remaining are grouped as 'Others'. " + "The maximum number of values to should is configurable (default: 20).", + "heatmap_column_normalized": "The column-normalized heatmap allows for comparing of time bins when the counts in each bin vary.", + "heatmap_row_normalized": "The row-normalized heatmaps allows for monitoring one value over time.", + } + """Descriptions of the heatmaps in the report""" + + plot_hist_n: int = 2 + """plot histograms for last 'n' periods. default is 2 (optional)""" + + top_n: int = 20 + """plot heatmap for top 'n' categories. default is 20 (optional)""" + + cmap: str = "autumn_r" + """colormap for histogram heatmaps""" + + +class TrafficLightsSection(SectionModel): + name = "Traffic Lights" + """Name of the traffic lights section in the report""" + + description = "Traffic light calculation for different statistics (based on the calculated normalized residual, a.k.a. pull). Statistics for which all traffic lights are green are hidden from view by default." + """Description of the traffic lights section in the report""" + + +class ComparisonsSection(SectionModel): + name = "Comparisons" + """Name of the comparisons section in the report""" + + description = ( + "Statistical comparisons of each time period (one bin) to the reference data." 
+ ) + """Description of the comparisons section in the report""" + + +class OverviewSection(SectionModel): + name = "Overview" + """Name of the overview section in the report""" + + description = "Alerts aggregated per feature" + """Description of the overview section in the report""" + + +class Section(BaseModel): + """Configuration for the individual sections""" + + profiles: ProfilesSection = ProfilesSection() + alerts: AlertSection = AlertSection() + histograms: HistogramSectionModel = HistogramSectionModel() + overview: OverviewSection = OverviewSection() + comparisons: ComparisonsSection = ComparisonsSection() + traffic_lights: TrafficLightsSection = TrafficLightsSection() + + +class Report(BaseModel): + """Report-specific configuration""" + + skip_empty_plots: bool = True + """if false, also show empty plots in report with only nans or zeroes (optional)""" + + last_n: int = 0 + """plot statistic data for last 'n' periods (optional)""" + + skip_first_n: int = 0 + """in plot skip first 'n' periods. last_n takes precedence (optional)""" + + skip_last_n: int = 0 + """in plot skip last 'n' periods. last_n takes precedence (optional)""" + + report_filepath: Optional[Union[str, Path]] = None + """the file path where to output the report (optional)""" + + extended_report: bool = True + """if True, show all the generated statistics in the report (optional) + if set to False, then smaller show_stats (see below)""" + + show_stats: List[str] = [ "distinct*", "filled*", "nan*", @@ -90,46 +177,89 @@ "phik*", "*unknown_labels*", "*chi2_norm*", - "*ks*", "*zscore*", "n_*", - "worst", - ], -} -for key in Comparisons.get_comparisons().keys(): - config["limited_stats"].append(f"*{key}*") + "*jsd*", + "*psi*", + "*max_prob_diff*", + ] + """list of statistic name patterns to show in the report. 
If None, show all (optional)""" + + section: Section = Section() + """Configuration for the individual sections""" + + +class Comparison(BaseModel): + """Parameters related to comparisons""" + window: int = 10 + """size of rolling window and/or trend detection. default is 10.""" -def get_stat_description(name: str): - """Gets the description of a statistic. + shift: int = 1 + """shift of time-bins in rolling/expanding window. default is 1.""" - :param str name: the name of the statistic. - :returns str: the description of the statistic. If not found, returns an empty string +class Monitoring(BaseModel): + """Parameters related to monitoring""" + + monitoring_rules: Dict[str, List[Union[float, int]]] = { + "*_pull": [7, 4, -4, -7], + "*_zscore": [7, 4, -4, -7], + "[!p]*_unknown_labels": [0.5, 0.5, 0, 0], + } """ - if not isinstance(name, str): - raise TypeError("Statistic's name should be a string.") + monitoring rules to generate traffic light alerts. + The default setting is: - if name in histograms: - return histograms[name] - if name in profiles: - return profiles[name] - if name in alerts: - return alerts[name] + .. code-block:: python - head, *tail = name.split("_") - tail = "_".join(tail) + monitoring_rules = { + "*_pull": [7, 4, -4, -7], + "*_zscore": [7, 4, -4, -7], + "[!p]*_unknown_labels": [0.5, 0.5, 0, 0], + } - if tail in comparisons and head in references: - return comparisons[tail].format(ref=references[head]) + Note that the (filename based) wildcards such as * apply to all statistic names matching that pattern. + For example, ``"*_pull"`` applies for all features to all statistics ending on "_pull". + You can also specify rules for specific features and/or statistics by leaving out wildcard and putting the + feature name in front. E.g. - return "" + .. 
code-block:: python + monitoring_rules = { + "featureA:*_pull": [5, 3, -3, -5], + "featureA:nan": [4, 1, 0, 0], + "*_pull": [7, 4, -4, -7], + "nan": [8, 1, 0, 0], + } -# Global configuration for the joblib parallelization. Could be used to change the number of jobs, and/or change -# the backend from default (loki) to 'multiprocessing' or 'threading'. -# (see https://joblib.readthedocs.io/en/latest/generated/joblib.Parallel.html for details) -parallel_args = {"n_jobs": 1} + In case of multiple rules could apply for a feature's statistic, the most specific one applies. + So in case of the statistic "nan": "featureA:nan" is used for "featureA", and the other "nan" rule + for all other features. + """ -# Usage the `ing_matplotlib_theme` -themed = True + pull_rules: Dict[str, List[Union[float, int]]] = {"*_pull": [7, 4, -4, -7]} + """ + red and yellow (possibly dynamic) boundaries shown in plots in the report. + Default is: + + .. code-block:: python + + pull_rules = {"*_pull": [7, 4, -4, -7]} + + This means that the shown yellow boundaries are at -4, +4 standard deviations around the (reference) mean, + and the shown red boundaries are at -7, +7 standard deviations around the (reference) mean. + Note that the (filename based) wildcards such as * apply to all statistic names matching that pattern. + (The same string logic applies as for monitoring_rules.) 
+ """ + + +class Settings(BaseSettings): + report: Report = Report() + """Settings regarding the report""" + + comparison: Comparison = Comparison() + """Settings related to the comparisons""" + + monitoring: Monitoring = Monitoring() + """Settings related to monitoring""" diff --git a/popmon/notebooks/popmon_tutorial_advanced.ipynb b/popmon/notebooks/popmon_tutorial_advanced.ipynb index 7272761b..9739176e 100644 --- a/popmon/notebooks/popmon_tutorial_advanced.ipynb +++ b/popmon/notebooks/popmon_tutorial_advanced.ipynb @@ -11,7 +11,6 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": false, "jupyter": { "outputs_hidden": false }, @@ -35,6 +34,7 @@ "source": [ "# install popmon (if not installed yet)\n", "import sys\n", + "\n", "!\"{sys.executable}\" -m pip install -q popmon" ] }, @@ -47,7 +47,8 @@ "import pandas as pd\n", "\n", "import popmon\n", - "from popmon import resources" + "from popmon import resources\n", + "from popmon.config import Report, Settings" ] }, { @@ -64,7 +65,9 @@ "metadata": {}, "outputs": [], "source": [ - "df = pd.read_csv(resources.data(\"flight_delays.csv.gz\"), index_col=0, parse_dates=[\"DATE\"])" + "df = pd.read_csv(\n", + " resources.data(\"flight_delays.csv.gz\"), index_col=0, parse_dates=[\"DATE\"]\n", + ")" ] }, { @@ -103,8 +106,11 @@ "metadata": {}, "outputs": [], "source": [ + "settings = Settings()\n", + "settings.report.extended_report = False\n", + "\n", "df.pm_stability_report(\n", - " time_axis=\"DATE\", time_width=\"1w\", time_offset=\"2015-07-02\", extended_report=False\n", + " time_axis=\"DATE\", time_width=\"1w\", time_offset=\"2015-07-02\", settings=settings\n", ")" ] }, @@ -122,12 +128,14 @@ "metadata": {}, "outputs": [], "source": [ + "# reuse the previous settings\n", + "settings.monitoring.pull_rules = {\"*_pull\": [10, 7, -7, -10]}\n", + "\n", "df.pm_stability_report(\n", " time_axis=\"DATE\",\n", " time_width=\"1w\",\n", " time_offset=\"2015-07-02\",\n", - " extended_report=False,\n", - " 
pull_rules={\"*_pull\": [10, 7, -7, -10]},\n", + " settings=settings,\n", ")" ] }, @@ -152,8 +160,8 @@ "source": [ "# download histogrammar jar files if not already installed, used for histogramming of spark dataframe\n", "try:\n", - " from pyspark.sql import SparkSession\n", " from pyspark import __version__ as pyspark_version\n", + " from pyspark.sql import SparkSession\n", "\n", " pyspark_installed = True\n", "except ImportError:\n", @@ -168,21 +176,24 @@ "outputs": [], "source": [ "if pyspark_installed:\n", - " scala = '2.12' if int(pyspark_version[0]) >= 3 else '2.11'\n", - " hist_jar = f'io.github.histogrammar:histogrammar_{scala}:1.0.20'\n", - " hist_spark_jar = f'io.github.histogrammar:histogrammar-sparksql_{scala}:1.0.20'\n", - " \n", + " scala = \"2.12\" if int(pyspark_version[0]) >= 3 else \"2.11\"\n", + " hist_jar = f\"io.github.histogrammar:histogrammar_{scala}:1.0.20\"\n", + " hist_spark_jar = f\"io.github.histogrammar:histogrammar-sparksql_{scala}:1.0.20\"\n", + "\n", " spark = SparkSession.builder.config(\n", - " \"spark.jars.packages\", f'{hist_spark_jar},{hist_jar}'\n", + " \"spark.jars.packages\", f\"{hist_spark_jar},{hist_jar}\"\n", " ).getOrCreate()\n", "\n", " sdf = spark.createDataFrame(df)\n", "\n", + " settings = Settings()\n", + " settings.report.extended_report = False\n", + "\n", " sdf.pm_stability_report(\n", " time_axis=\"DATE\",\n", " time_width=\"1w\",\n", " time_offset=\"2015-07-02\",\n", - " extended_report=False,\n", + " settings=settings,\n", " )" ] }, @@ -204,6 +215,9 @@ "metadata": {}, "outputs": [], "source": [ + "settings = Settings()\n", + "settings.report.extended_report = False\n", + "\n", "df_ref = pd.read_csv(\n", " resources.data(\"flight_delays_reference.csv.gz\"), index_col=0, parse_dates=[\"DATE\"]\n", ")\n", @@ -211,9 +225,9 @@ " time_axis=\"DATE\",\n", " time_width=\"1w\",\n", " time_offset=\"2015-07-02\",\n", - " extended_report=False,\n", " reference_type=\"external\",\n", " reference=df_ref,\n", + " 
settings=settings,\n", ")" ] }, @@ -235,8 +249,8 @@ " time_axis=\"DATE\",\n", " time_width=\"1w\",\n", " time_offset=\"2015-07-02\",\n", - " extended_report=False,\n", " reference_type=\"expanding\",\n", + " settings=settings,\n", ")" ] }, @@ -255,13 +269,14 @@ "metadata": {}, "outputs": [], "source": [ + "settings.comparison.window = 5\n", + "\n", "df.pm_stability_report(\n", " time_axis=\"DATE\",\n", " time_width=\"1w\",\n", " time_offset=\"2015-07-02\",\n", - " extended_report=False,\n", " reference_type=\"rolling\",\n", - " window=5,\n", + " settings=settings,\n", ")" ] }, @@ -400,15 +415,18 @@ "metadata": {}, "outputs": [], "source": [ + "report_settings = Report()\n", + "report_settings.last_n = 0\n", + "report_settings.skip_first_n = 0\n", + "report_settings.skip_last_n = 0\n", + "report_settings.section.histograms.plot_hist_n = 2\n", + "report_settings.skip_empty_plots = True\n", + "report_settings.report_filepath = None\n", + "\n", "report.regenerate(\n", - " last_n=0,\n", - " skip_first_n=0,\n", - " skip_last_n=0,\n", - " plot_hist_n=2,\n", - " skip_empty_plots=True,\n", - " report_filepath=None,\n", " store_key=\"html_report\",\n", " sections_key=\"report_sections\",\n", + " report_settings=report_settings,\n", ")" ] }, @@ -426,29 +444,31 @@ "metadata": {}, "outputs": [], "source": [ - "from popmon.hist.hist_splitter import HistSplitter\n", "from popmon.analysis.profiling import HistProfiler\n", - "from popmon.pipeline.report import StabilityReport\n", "from popmon.base import Pipeline\n", - "from popmon.visualization import SectionGenerator, ReportGenerator\n", + "from popmon.hist.hist_splitter import HistSplitter\n", + "from popmon.pipeline.report import StabilityReport\n", + "from popmon.visualization import ReportGenerator, SectionGenerator\n", "\n", - "monitoring_rules = {\n", - " \"*_pull\": [7, 4, -4, -7],\n", - " \"*_zscore\": [7, 4, -4, -7],\n", - " \"[!p]*_unknown_labels\": [0.5, 0.5, 0, 0],\n", - "}\n", "datastore = {\n", - " \"hists\": 
df.pm_make_histograms(time_axis=\"DATE\", time_width=\"1w\", time_offset=\"2015-07-02\")\n", + " \"hists\": df.pm_make_histograms(\n", + " time_axis=\"DATE\", time_width=\"1w\", time_offset=\"2015-07-02\"\n", + " )\n", "}\n", "\n", "\n", "class CustomPipeline(Pipeline):\n", " def __init__(self):\n", " modules = [\n", - " HistSplitter(read_key=\"hists\", store_key=\"split_hists\", feature_begins_with=\"DATE\"),\n", + " HistSplitter(\n", + " read_key=\"hists\", store_key=\"split_hists\", feature_begins_with=\"DATE\"\n", + " ),\n", " HistProfiler(read_key=\"split_hists\", store_key=\"profiles\"),\n", " SectionGenerator(\n", - " section_name=\"Profiles\", read_key=\"profiles\", store_key=\"report_sections\"\n", + " section_name=\"Profiles\",\n", + " read_key=\"profiles\",\n", + " store_key=\"report_sections\",\n", + " settings=report_settings,\n", " ),\n", " ReportGenerator(read_key=\"report_sections\", store_key=\"html_report\"),\n", " ]\n", @@ -477,16 +497,19 @@ "source": [ "from popmon.analysis.comparison.hist_comparer import ReferenceHistComparer\n", "\n", - "\n", "datastore = {\n", - " \"hists\": df.pm_make_histograms(time_axis=\"DATE\", time_width=\"1w\", time_offset=\"2015-07-02\")\n", + " \"hists\": df.pm_make_histograms(\n", + " time_axis=\"DATE\", time_width=\"1w\", time_offset=\"2015-07-02\"\n", + " )\n", "}\n", "\n", "\n", "class CustomComparisonsPipeline(Pipeline):\n", " def __init__(self):\n", " modules = [\n", - " HistSplitter(read_key=\"hists\", store_key=\"split_hists\", feature_begins_with=\"DATE\"),\n", + " HistSplitter(\n", + " read_key=\"hists\", store_key=\"split_hists\", feature_begins_with=\"DATE\"\n", + " ),\n", " HistProfiler(read_key=\"split_hists\", store_key=\"profiles\"),\n", " ReferenceHistComparer(\n", " reference_key=\"split_hists\",\n", @@ -494,16 +517,22 @@ " store_key=\"comparisons\",\n", " ),\n", " SectionGenerator(\n", - " section_name=\"Profiles\", read_key=\"profiles\", store_key=\"report_sections\"\n", + " 
section_name=\"Profiles\",\n", + " read_key=\"profiles\",\n", + " store_key=\"report_sections\",\n", + " settings=report_settings,\n", " ),\n", " SectionGenerator(\n", - " section_name=\"Comparisons\", read_key=\"comparisons\", store_key=\"report_sections\"\n", + " section_name=\"Comparisons\",\n", + " read_key=\"comparisons\",\n", + " store_key=\"report_sections\",\n", + " settings=report_settings,\n", " ),\n", " ReportGenerator(read_key=\"report_sections\", store_key=\"html_report\"),\n", " ]\n", " super().__init__(modules)\n", "\n", - " \n", + "\n", "pipeline = CustomComparisonsPipeline()\n", "datastore = pipeline.transform(datastore)\n", "\n", @@ -556,7 +585,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.7" + "version": "3.8.8" }, "nteract": { "version": "0.15.0" diff --git a/popmon/notebooks/popmon_tutorial_basic.ipynb b/popmon/notebooks/popmon_tutorial_basic.ipynb index 3c086ea0..97a153d9 100644 --- a/popmon/notebooks/popmon_tutorial_basic.ipynb +++ b/popmon/notebooks/popmon_tutorial_basic.ipynb @@ -18,7 +18,6 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": false, "jupyter": { "outputs_hidden": false }, @@ -29,7 +28,7 @@ "outputs": [], "source": [ "# (optional) Adjust the jupyter notebook style for easier navigation of the reports\n", - "from IPython.core.display import display, HTML\n", + "from IPython.core.display import HTML, display\n", "\n", "# Wider notebook\n", "display(HTML(\"\"))\n", @@ -58,6 +57,7 @@ "outputs": [], "source": [ "import sys\n", + "\n", "!\"{sys.executable}\" -m pip install -q popmon" ] }, @@ -75,8 +75,10 @@ "outputs": [], "source": [ "import pandas as pd\n", + "\n", "import popmon\n", - "from popmon import resources" + "from popmon import resources\n", + "from popmon.config import Report" ] }, { @@ -149,7 +151,11 @@ }, "outputs": [], "source": [ - "report.regenerate(extended_report=False, plot_hist_n=3)" + "report_settings = Report()\n", + 
"report_settings.extended_report = False\n", + "report_settings.section.histograms.plot_hist_n = 6\n", + "\n", + "report.regenerate(report_settings=report_settings)" ] }, { diff --git a/popmon/notebooks/popmon_tutorial_incremental_data.ipynb b/popmon/notebooks/popmon_tutorial_incremental_data.ipynb index ed30289e..450dbc17 100644 --- a/popmon/notebooks/popmon_tutorial_incremental_data.ipynb +++ b/popmon/notebooks/popmon_tutorial_incremental_data.ipynb @@ -37,6 +37,7 @@ "outputs": [], "source": [ "import sys\n", + "\n", "!\"{sys.executable}\" -m pip install -q popmon" ] }, @@ -49,8 +50,7 @@ "import pandas as pd\n", "\n", "import popmon\n", - "from popmon import stability_report, stitch_histograms, get_bin_specs\n", - "from popmon import resources" + "from popmon import get_bin_specs, resources, stability_report, stitch_histograms" ] }, { diff --git a/popmon/notebooks/popmon_tutorial_reports.ipynb b/popmon/notebooks/popmon_tutorial_reports.ipynb index c2c54057..3340c181 100644 --- a/popmon/notebooks/popmon_tutorial_reports.ipynb +++ b/popmon/notebooks/popmon_tutorial_reports.ipynb @@ -30,7 +30,9 @@ "metadata": {}, "outputs": [], "source": [ - "df = pd.read_csv(resources.data(\"flight_delays.csv.gz\"), index_col=0, parse_dates=[\"DATE\"])\n", + "df = pd.read_csv(\n", + " resources.data(\"flight_delays.csv.gz\"), index_col=0, parse_dates=[\"DATE\"]\n", + ")\n", "report = df.pm_stability_report(time_axis=\"DATE\", time_width=\"1w\")" ] }, @@ -67,7 +69,7 @@ "metadata": {}, "outputs": [], "source": [ - "list(report.datastore['report_sections'][0].keys())" + "list(report.datastore[\"report_sections\"][0].keys())" ] }, { @@ -85,7 +87,7 @@ "metadata": {}, "outputs": [], "source": [ - "[section['section_title'] for section in report.datastore['report_sections']]" + "[section[\"section_title\"] for section in report.datastore[\"report_sections\"]]" ] }, { @@ -103,15 +105,17 @@ "metadata": {}, "outputs": [], "source": [ - "from IPython.core.display import display, HTML\n", + 
"from IPython.core.display import HTML, display\n", + "\n", "\n", "def show_image(plot):\n", " display(HTML(f''))\n", " text = f'{plot[\"name\"]}'\n", - " if plot['description']:\n", + " if plot[\"description\"]:\n", " text += f': {plot[\"description\"]}'\n", " display(HTML(text))\n", "\n", + "\n", "def show_table(plot):\n", " style = \"\"\"table.overview{\n", " margin: 25px;\n", @@ -147,7 +151,7 @@ " font-weight: 300;\n", " }\n", " \"\"\"\n", - " display(HTML(f''))\n", + " display(HTML(f\"\"))\n", " display(HTML(plot[\"plot\"]))" ] }, @@ -187,7 +191,7 @@ "outputs": [], "source": [ "# First section, First Feature, First plot\n", - "show_image(report.datastore['report_sections'][1]['features'][0]['plots'][0])" + "show_image(report.datastore[\"report_sections\"][1][\"features\"][0][\"plots\"][0])" ] }, { @@ -216,7 +220,7 @@ "metadata": {}, "outputs": [], "source": [ - "show_image(report.datastore['report_sections'][1]['features'][1]['plots'][0])" + "show_image(report.datastore[\"report_sections\"][1][\"features\"][1][\"plots\"][0])" ] }, { @@ -266,7 +270,7 @@ "metadata": {}, "outputs": [], "source": [ - "show_image(report.datastore['report_sections'][1]['features'][0]['plots'][2])" + "show_image(report.datastore[\"report_sections\"][1][\"features\"][0][\"plots\"][2])" ] }, { @@ -299,7 +303,7 @@ "metadata": {}, "outputs": [], "source": [ - "show_table(report.datastore['report_sections'][2]['features'][0]['plots'][0])" + "show_table(report.datastore[\"report_sections\"][2][\"features\"][0][\"plots\"][0])" ] }, { @@ -325,7 +329,7 @@ "metadata": {}, "outputs": [], "source": [ - "show_table(report.datastore['report_sections'][3]['features'][0]['plots'][0])" + "show_table(report.datastore[\"report_sections\"][3][\"features\"][0][\"plots\"][0])" ] }, { @@ -351,7 +355,7 @@ "metadata": {}, "outputs": [], "source": [ - "show_image(report.datastore['report_sections'][4]['features'][0]['plots'][0])" + 
"show_image(report.datastore[\"report_sections\"][4][\"features\"][0][\"plots\"][0])" ] }, { @@ -377,7 +381,7 @@ "metadata": {}, "outputs": [], "source": [ - "show_image(report.datastore['report_sections'][5]['features'][0]['plots'][0])" + "show_image(report.datastore[\"report_sections\"][5][\"features\"][0][\"plots\"][0])" ] } ], diff --git a/popmon/pipeline/amazing_pipeline.py b/popmon/pipeline/amazing_pipeline.py index 90015dd7..fcb626fc 100644 --- a/popmon/pipeline/amazing_pipeline.py +++ b/popmon/pipeline/amazing_pipeline.py @@ -23,7 +23,6 @@ from popmon import resources from ..base import Pipeline -from ..config import config from ..io import JsonReader from ..pipeline.report_pipelines import SelfReference @@ -47,7 +46,6 @@ def run(): ) cfg = { - **config, "histograms_path": resources.data("synthetic_histograms.json"), "hists_key": "hists", "ref_hists_key": "hists", @@ -60,7 +58,6 @@ def run(): "*_zscore": [7, 4, -4, -7], }, "pull_rules": {"*_pull": [7, 4, -4, -7]}, - "show_stats": config["limited_stats"], } pipeline = AmazingPipeline(**cfg) diff --git a/popmon/pipeline/metrics.py b/popmon/pipeline/metrics.py index b6b72d85..d0a7f654 100644 --- a/popmon/pipeline/metrics.py +++ b/popmon/pipeline/metrics.py @@ -27,6 +27,7 @@ make_histograms, ) +from ..config import Settings from ..pipeline.metrics_pipelines import create_metrics_pipeline logging.basicConfig( @@ -37,13 +38,10 @@ def stability_metrics( hists, + settings: Settings, reference_type="self", reference=None, time_axis="", - window=10, - shift=1, - monitoring_rules=None, - pull_rules=None, features=None, **kwargs, ): @@ -54,47 +52,6 @@ def stability_metrics( default is 'self'. :param reference: histograms used as reference. default is None :param str time_axis: name of datetime feature, used as time axis, eg 'date'. auto-guessed when not provided. - :param int window: size of rolling window and/or trend detection. default is 10. - :param int shift: shift of time-bins in rolling/expanding window. 
default is 1. - :param dict monitoring_rules: monitoring rules to generate traffic light alerts. - The default setting is: - - .. code-block:: python - - monitoring_rules = { - "*_pull": [7, 4, -4, -7], - "*_zscore": [7, 4, -4, -7], - "[!p]*_unknown_labels": [0.5, 0.5, 0, 0], - } - - Note that the (filename based) wildcards such as * apply to all statistic names matching that pattern. - For example, ``"*_pull"`` applies for all features to all statistics ending on "_pull". - You can also specify rules for specific features and/or statistics by leaving out wildcard and putting the - feature name in front. E.g. - - .. code-block:: python - - monitoring_rules = { - "featureA:*_pull": [5, 3, -3, -5], - "featureA:nan": [4, 1, 0, 0], - "*_pull": [7, 4, -4, -7], - "nan": [8, 1, 0, 0], - } - - In case of multiple rules could apply for a feature's statistic, the most specific one applies. - So in case of the statistic "nan": "featureA:nan" is used for "featureA", and the other "nan" rule - for all other features. - :param dict pull_rules: red and yellow (possibly dynamic) boundaries shown in plots in the report. - Default is: - - .. code-block:: python - - pull_rules = {"*_pull": [7, 4, -4, -7]} - - This means that the shown yellow boundaries are at -4, +4 standard deviations around the (reference) mean, - and the shown red boundaries are at -7, +7 standard deviations around the (reference) mean. - Note that the (filename based) wildcards such as * apply to all statistic names matching that pattern. - (The same string logic applies as for monitoring_rules.) :param list features: histograms to pick up from the 'hists' dictionary (default is all keys) :param kwargs: residual keyword arguments passed on to report pipeline. 
:return: dict with results of metrics pipeline @@ -103,15 +60,6 @@ def stability_metrics( if not isinstance(hists, dict): raise TypeError("hists should be a dict of histogrammar histograms.") - if not isinstance(monitoring_rules, dict): - monitoring_rules = { - "*_pull": [7, 4, -4, -7], - "*_zscore": [7, 4, -4, -7], - "[!p]*_unknown_labels": [0.5, 0.5, 0, 0], - } - if not isinstance(pull_rules, dict): - pull_rules = {"*_pull": [7, 4, -4, -7]} - if (isinstance(time_axis, str) and len(time_axis) == 0) or ( isinstance(time_axis, bool) and time_axis ): @@ -119,16 +67,15 @@ def stability_metrics( first_cols = [k.split(":")[0] for k in list(hists.keys())] time_axis = max(set(first_cols), key=first_cols.count) + if reference_type == "external" and "ref_hists_key" not in kwargs: + kwargs["ref_hists_key"] = "ref_hists" + pipeline = create_metrics_pipeline( + settings=settings, reference_type=reference_type, reference=reference, hists_key="hists", - ref_hists_key="ref_hists", time_axis=time_axis, - window=window, - shift=shift, - monitoring_rules=monitoring_rules, - pull_rules=pull_rules, features=features, **kwargs, ) @@ -143,6 +90,7 @@ def stability_metrics( def df_stability_metrics( df, time_axis, + settings: Settings = None, features=None, binning="auto", bin_specs=None, @@ -151,10 +99,6 @@ def df_stability_metrics( var_dtype=None, reference_type="self", reference=None, - window=10, - shift=1, - monitoring_rules=None, - pull_rules=None, **kwargs, ): """Create a data stability monitoring html datastore for given pandas or spark dataframe. @@ -204,50 +148,12 @@ def df_stability_metrics( :param reference_type: type or reference used for comparisons. Options [self, external, rolling, expanding]. default is 'self'. :param reference: reference dataframe or histograms. default is None - :param int window: size of rolling window and/or trend detection. default is 10. - :param int shift: shift of time-bins in rolling/expanding window. default is 1. 
- :param dict monitoring_rules: monitoring rules to generate traffic light alerts. - The default setting is: - - .. code-block:: python - - monitoring_rules = { - "*_pull": [7, 4, -4, -7], - "*_zscore": [7, 4, -4, -7], - "[!p]*_unknown_labels": [0.5, 0.5, 0, 0], - } - - Note that the (filename based) wildcards such as * apply to all statistic names matching that pattern. - For example, ``"*_pull"`` applies for all features to all statistics ending on "_pull". - You can also specify rules for specific features and/or statistics by leaving out wildcard and putting the - feature name in front. E.g. - - .. code-block:: python - - monitoring_rules = { - "featureA:*_pull": [5, 3, -3, -5], - "featureA:nan": [4, 1, 0, 0], - "*_pull": [7, 4, -4, -7], - "nan": [8, 1, 0, 0], - } - - In case of multiple rules could apply for a feature's statistic, the most specific one applies. - So in case of the statistic "nan": "featureA:nan" is used for "featureA", and the other "nan" rule - for all other features. - :param dict pull_rules: red and yellow (possibly dynamic) boundaries shown in plots in the report. - Default is: - - .. code-block:: python - - pull_rules = {"*_pull": [7, 4, -4, -7]} - - This means that the shown yellow boundaries are at -4, +4 standard deviations around the (reference) mean, - and the shown red boundaries are at -7, +7 standard deviations around the (reference) mean. - Note that the (filename based) wildcards such as * apply to all statistic names matching that pattern. - (The same string logic applies as for monitoring_rules.) 
:param kwargs: residual keyword arguments, passed on to stability_report() :return: dict with results of metrics pipeline """ + if settings is None: + settings = Settings() + # basic checks on presence of time_axis if not (isinstance(time_axis, str) and len(time_axis) > 0) and not ( isinstance(time_axis, bool) and time_axis @@ -304,7 +210,6 @@ def df_stability_metrics( } bin_specs[time_axis] = time_specs - reference_hists = None if reference is not None: reference_type = "external" if isinstance(reference, dict): @@ -331,6 +236,7 @@ def df_stability_metrics( var_dtype, ret_specs=True, ) + kwargs["reference_hists"] = reference_hists # use the same features, bin_specs, time_axis, etc as for reference hists hists = make_histograms( @@ -345,13 +251,9 @@ def df_stability_metrics( # generate data stability report return stability_metrics( hists, - reference_type, - reference_hists, - time_axis, - window, - shift, - monitoring_rules, - pull_rules, - features, + settings=settings, + reference_type=reference_type, + time_axis=time_axis, + features=features, **kwargs, ) diff --git a/popmon/pipeline/metrics_pipelines.py b/popmon/pipeline/metrics_pipelines.py index 086dccce..601c5eb1 100644 --- a/popmon/pipeline/metrics_pipelines.py +++ b/popmon/pipeline/metrics_pipelines.py @@ -16,7 +16,7 @@ # COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER # IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- +from typing import List, Union from ..alerting import ( AlertsSummary, @@ -40,7 +40,8 @@ RefMedianMadPullCalculator, RollingPullCalculator, ) -from ..base import Pipeline +from ..base import Module, Pipeline +from ..config import Settings from ..hist.hist_splitter import HistSplitter @@ -67,13 +68,11 @@ def get_metrics_pipeline_class(reference_type, reference): def create_metrics_pipeline( + settings: Settings, reference_type="self", reference=None, hists_key="hists", time_axis="", - window=10, - monitoring_rules={}, - pull_rules={}, features=None, **kwargs, ): @@ -81,10 +80,8 @@ def create_metrics_pipeline( cfg = { "hists_key": hists_key, "time_axis": time_axis, - "window": window, - "monitoring_rules": monitoring_rules, - "pull_rules": pull_rules, "features": features, + "settings": settings, **kwargs, } @@ -94,41 +91,143 @@ def create_metrics_pipeline( return pipeline +def get_splitting_modules( + hists_key, features, time_axis +) -> List[Union[Module, Pipeline]]: + """ + Splitting of test histograms. For each histogram with datetime i, comparison of histogram i with histogram i-1, + results in chi2 comparison of histograms + """ + modules: List[Union[Module, Pipeline]] = [ + HistSplitter( + read_key=hists_key, + store_key="split_hists", + features=features, + feature_begins_with=f"{time_axis}:", + ), + PreviousHistComparer(read_key="split_hists", store_key="comparisons"), + HistProfiler(read_key="split_hists", store_key="profiles"), + ] + return modules + + +def get_traffic_light_modules(monitoring_rules) -> List[Union[Module, Pipeline]]: + """ + Expand all (wildcard) static traffic light bounds and apply them. 
+ Applied to both profiles and comparisons datasets + """ + modules: List[Union[Module, Pipeline]] = [ + TrafficLightAlerts( + read_key="profiles", + rules=monitoring_rules, + store_key="traffic_lights", + expanded_rules_key="static_bounds", + ), + TrafficLightAlerts( + read_key="comparisons", + rules=monitoring_rules, + store_key="traffic_lights", + expanded_rules_key="static_bounds_comparisons", + ), + ApplyFunc( + apply_to_key="traffic_lights", + apply_funcs=[{"func": traffic_light_summary, "axis": 1, "suffix": ""}], + assign_to_key="alerts", + msg="Generating traffic light alerts summary.", + ), + AlertsSummary(read_key="alerts"), + ] + return modules + + +def get_static_bound_modules(pull_rules) -> List[Union[Module, Pipeline]]: + """ + generate dynamic traffic light boundaries, based on traffic lights for normalized residuals, used for + plotting in popmon_profiles report. + """ + modules: List[Union[Module, Pipeline]] = [ + StaticBounds( + read_key="profiles", + rules=pull_rules, + store_key="dynamic_bounds", + suffix_mean="_mean", + suffix_std="_std", + ), + StaticBounds( + read_key="comparisons", + rules=pull_rules, + store_key="dynamic_bounds_comparisons", + suffix_mean="_mean", + suffix_std="_std", + ), + ] + return modules + + +def get_dynamic_bound_modules(pull_rules) -> List[Union[Module, Pipeline]]: + """ + Generate dynamic traffic light boundaries, based on traffic lights for normalized residuals, used for + plotting in popmon_profiles report. 
+ """ + modules: List[Union[Module, Pipeline]] = [ + DynamicBounds( + read_key="profiles", + rules=pull_rules, + store_key="dynamic_bounds", + suffix_mean="_mean", + suffix_std="_std", + ), + DynamicBounds( + read_key="comparisons", + rules=pull_rules, + store_key="dynamic_bounds_comparisons", + suffix_mean="_mean", + suffix_std="_std", + ), + ] + return modules + + +def get_trend_modules(window) -> List[Union[Module, Pipeline]]: + """Looking for significant rolling linear trends in selected features/metrics""" + modules: List[Union[Module, Pipeline]] = [ + ApplyFunc( + apply_to_key="profiles", + assign_to_key="comparisons", + apply_funcs=[ + { + "func": rolling_lr_zscore, + "suffix": f"_trend{window}_zscore", + "entire": True, + "window": window, + "metrics": ["mean", "phik", "fraction_true"], + } + ], + msg="Computing significance of (rolling) trend in means of features", + ), + ] + return modules + + class SelfReferenceMetricsPipeline(Pipeline): def __init__( self, - hists_key="test_hists", - time_axis="date", - window=10, - monitoring_rules={}, - pull_rules={}, - features=None, - **kwargs, + settings: Settings, + hists_key, + time_axis, + features, ): """Example metrics pipeline for comparing test data with itself (full test set) :param str hists_key: key to test histograms in datastore. default is 'test_hists' :param str time_axis: name of datetime feature. default is 'date' - :param int window: window size for trend detection. default is 10 - :param dict monitoring_rules: traffic light rules - :param dict pull_rules: pull rules to determine dynamic boundaries :param list features: features of histograms to pick up from input data (optional) - :param kwargs: residual keyword arguments :return: assembled self reference pipeline """ - from popmon.analysis.comparison.comparisons import Comparisons + from popmon.analysis.comparison import Comparisons - modules = [ - # 1. 
splitting of test histograms - HistSplitter( - read_key=hists_key, - store_key="split_hists", - features=features, - feature_begins_with=f"{time_axis}:", - ), - # 2. for each histogram with datetime i, comparison of histogram i with histogram i-1, results in - # chi2 comparison of histograms - PreviousHistComparer(read_key="split_hists", store_key="comparisons"), + reference_prefix = "ref" + reference_modules: List[Union[Module, Pipeline]] = [ # 3. Comparison of with profiled test histograms, results in chi2 comparison of histograms ReferenceHistComparer( reference_key="split_hists", @@ -141,11 +240,14 @@ def __init__( suffix_mean="_mean", suffix_std="_std", suffix_pull="_pull", - metrics=[f"ref_{key}" for key in Comparisons.get_comparisons().keys()], + metrics=[ + f"{reference_prefix}_{key}" + for key in Comparisons.get_keys() + if key in ["max_prob_diff", "psi", "jsd"] + ], ), # 4. profiling of histograms, then pull calculation compared with reference mean and std, # to obtain normalized residuals of profiles - HistProfiler(read_key="split_hists", store_key="profiles"), RefMedianMadPullCalculator( reference_key="profiles", assign_to_key="profiles", @@ -153,97 +255,39 @@ def __init__( suffix_std="_std", suffix_pull="_pull", ), - # 5. looking for significant rolling linear trends in selected features/metrics - ApplyFunc( - apply_to_key="profiles", - assign_to_key="comparisons", - apply_funcs=[ - { - "func": rolling_lr_zscore, - "suffix": f"_trend{window}_zscore", - "entire": True, - "window": window, - "metrics": ["mean", "phik", "fraction_true"], - } - ], - msg="Computing significance of (rolling) trend in means of features", - ), - # 6. generate dynamic traffic light boundaries, based on traffic lights for normalized residuals, - # used for plotting in popmon_profiles report. 
- StaticBounds( - read_key="profiles", - rules=pull_rules, - store_key="dynamic_bounds", - suffix_mean="_mean", - suffix_std="_std", - ), - StaticBounds( - read_key="comparisons", - rules=pull_rules, - store_key="dynamic_bounds_comparisons", - suffix_mean="_mean", - suffix_std="_std", - ), - # 7. expand all (wildcard) static traffic light bounds and apply them. - # Applied to both profiles and comparisons datasets - TrafficLightAlerts( - read_key="profiles", - rules=monitoring_rules, - store_key="traffic_lights", - expanded_rules_key="static_bounds", - ), - TrafficLightAlerts( - read_key="comparisons", - rules=monitoring_rules, - store_key="traffic_lights", - expanded_rules_key="static_bounds_comparisons", - ), - ApplyFunc( - apply_to_key="traffic_lights", - apply_funcs=[{"func": traffic_light_summary, "axis": 1, "suffix": ""}], - assign_to_key="alerts", - msg="Generating traffic light alerts summary.", - ), - AlertsSummary(read_key="alerts"), ] + + modules = ( + get_splitting_modules(hists_key, features, time_axis) + + reference_modules + + get_trend_modules(settings.comparison.window) + + get_static_bound_modules(settings.monitoring.pull_rules) + + get_traffic_light_modules(settings.monitoring.monitoring_rules) + ) super().__init__(modules) class ExternalReferenceMetricsPipeline(Pipeline): def __init__( self, + settings: Settings, hists_key="test_hists", ref_hists_key="ref_hists", time_axis="date", - window=10, - monitoring_rules={}, - pull_rules={}, features=None, - **kwargs, ): """Example metrics pipeline for comparing test data with other (full) external reference set :param str hists_key: key to test histograms in datastore. default is 'test_hists' :param str ref_hists_key: key to reference histograms in datastore. default is 'ref_hists' :param str time_axis: name of datetime feature. default is 'date' (column should be timestamp, date(time) or numeric batch id) - :param int window: window size for trend detection. 
default is 10 - :param dict monitoring_rules: traffic light rules - :param dict pull_rules: pull rules to determine dynamic boundaries :param list features: features of histograms to pick up from input data (optional) - :param kwargs: residual keyword arguments :return: assembled external reference pipeline """ - modules = [ - # 1. splitting of test histograms - HistSplitter( - read_key=hists_key, - store_key="split_hists", - features=features, - feature_begins_with=f"{time_axis}:", - ), - # 2. for each histogram with datetime i, comparison of histogram i with histogram i-1, results in - # chi2 comparison of histograms - PreviousHistComparer(read_key="split_hists", store_key="comparisons"), + from popmon.analysis.comparison import Comparisons + + reference_prefix = "ref" + reference_modules: List[Union[Module, Pipeline]] = [ # 3. Profiling of split reference histograms, then chi2 comparison with test histograms HistSplitter( read_key=ref_hists_key, @@ -256,17 +300,20 @@ def __init__( assign_to_key="split_hists", store_key="comparisons", ), + HistProfiler(read_key="split_ref_hists", store_key="ref_profiles"), RefMedianMadPullCalculator( reference_key="comparisons", assign_to_key="comparisons", suffix_mean="_mean", suffix_std="_std", suffix_pull="_pull", - metrics=["ref_max_prob_diff"], + metrics=[ + f"{reference_prefix}_{key}" + for key in Comparisons.get_keys() + if key in ["max_prob_diff", "psi", "jsd"] + ], ), # 4. pull calculation compared with reference mean and std, to obtain normalized residuals of profiles - HistProfiler(read_key="split_hists", store_key="profiles"), - HistProfiler(read_key="split_ref_hists", store_key="ref_profiles"), ReferencePullCalculator( reference_key="ref_profiles", assign_to_key="profiles", @@ -274,103 +321,42 @@ def __init__( suffix_std="_std", suffix_pull="_pull", ), - # 5. 
looking for significant rolling linear trends in selected features/metrics - ApplyFunc( - apply_to_key="profiles", - assign_to_key="comparisons", - apply_funcs=[ - { - "func": rolling_lr_zscore, - "suffix": f"_trend{window}_zscore", - "entire": True, - "window": window, - "metrics": ["mean", "phik", "fraction_true"], - } - ], - msg="Computing significance of (rolling) trend in means of features", - ), - # 6. generate dynamic traffic light boundaries, based on traffic lights for normalized residuals, - # used for plotting in popmon_profiles report. - StaticBounds( - read_key="profiles", - rules=pull_rules, - store_key="dynamic_bounds", - suffix_mean="_mean", - suffix_std="_std", - ), - StaticBounds( - read_key="comparisons", - rules=pull_rules, - store_key="dynamic_bounds_comparisons", - suffix_mean="_mean", - suffix_std="_std", - ), - # 7. expand all (wildcard) static traffic light bounds and apply them. - # Applied to both profiles and comparisons datasets - TrafficLightAlerts( - read_key="profiles", - rules=monitoring_rules, - store_key="traffic_lights", - expanded_rules_key="static_bounds", - ), - TrafficLightAlerts( - read_key="comparisons", - rules=monitoring_rules, - store_key="traffic_lights", - expanded_rules_key="static_bounds_comparisons", - ), - ApplyFunc( - apply_to_key="traffic_lights", - apply_funcs=[{"func": traffic_light_summary, "axis": 1, "suffix": ""}], - assign_to_key="alerts", - msg="Generating traffic light alerts summary.", - ), - AlertsSummary(read_key="alerts"), ] + modules = ( + get_splitting_modules(hists_key, features, time_axis) + + reference_modules + + get_trend_modules(settings.comparison.window) + + get_static_bound_modules(settings.monitoring.pull_rules) + + get_traffic_light_modules(settings.monitoring.monitoring_rules) + ) super().__init__(modules) class RollingReferenceMetricsPipeline(Pipeline): def __init__( self, + settings: Settings, hists_key="test_hists", time_axis="date", - window=10, - shift=1, - monitoring_rules={}, - 
pull_rules={}, features=None, - **kwargs, ): """Example metrics pipeline for comparing test data with itself (rolling test set) :param str hists_key: key to test histograms in datastore. default is 'test_hists' :param str time_axis: name of datetime feature. default is 'date' - :param int window: size of rolling window and for trend detection. default is 10 - :param int shift: shift in rolling window. default is 1 - :param dict monitoring_rules: traffic light rules - :param dict pull_rules: pull rules to determine dynamic boundaries :param list features: features of histograms to pick up from input data (optional) - :param kwargs: residual keyword arguments :return: assembled rolling reference pipeline """ - modules = [ - # 1. splitting of test histograms - HistSplitter( - read_key=hists_key, - store_key="split_hists", - features=features, - feature_begins_with=f"{time_axis}:", - ), - # 2. for each histogram with datetime i, comparison of histogram i with histogram i-1, results in - # chi2 comparison of histograms - PreviousHistComparer(read_key="split_hists", store_key="comparisons"), + from popmon.analysis.comparison import Comparisons + + reference_prefix = "roll" + reference_modules: List[Union[Module, Pipeline]] = [ # 3. profiling of reference histograms, then comparison of with profiled test histograms # results in chi2 comparison of histograms RollingHistComparer( read_key="split_hists", - window=window, - shift=shift, + window=settings.comparison.window, + shift=settings.comparison.shift, store_key="comparisons", ), RefMedianMadPullCalculator( @@ -379,114 +365,59 @@ def __init__( suffix_mean="_mean", suffix_std="_std", suffix_pull="_pull", - metrics=["roll_max_prob_diff"], + metrics=[ + f"{reference_prefix}_{key}" + for key in Comparisons.get_keys() + if key in ["max_prob_diff", "psi", "jsd"] + ], ), # 4. 
profiling of histograms, then pull calculation compared with reference mean and std, # to obtain normalized residuals of profiles - HistProfiler(read_key="split_hists", store_key="profiles"), RollingPullCalculator( read_key="profiles", - window=window, - shift=shift, + window=settings.comparison.window, + shift=settings.comparison.shift, suffix_mean="_mean", suffix_std="_std", suffix_pull="_pull", ), - # 5. looking for significant rolling linear trends in selected features/metrics - ApplyFunc( - apply_to_key="profiles", - assign_to_key="comparisons", - apply_funcs=[ - { - "func": rolling_lr_zscore, - "suffix": f"_trend{window}_zscore", - "entire": True, - "window": window, - "metrics": ["mean", "phik", "fraction_true"], - } - ], - msg="Computing significance of (rolling) trend in means of features", - ), - # 6. generate dynamic traffic light boundaries, based on traffic lights for normalized residuals, - # used for plotting in popmon_profiles report. - DynamicBounds( - read_key="profiles", - rules=pull_rules, - store_key="dynamic_bounds", - suffix_mean="_mean", - suffix_std="_std", - ), - DynamicBounds( - read_key="comparisons", - rules=pull_rules, - store_key="dynamic_bounds_comparisons", - suffix_mean="_mean", - suffix_std="_std", - ), - # 7. expand all (wildcard) static traffic light bounds and apply them. 
- # Applied to both profiles and comparisons datasets - TrafficLightAlerts( - read_key="profiles", - rules=monitoring_rules, - store_key="traffic_lights", - expanded_rules_key="static_bounds", - ), - TrafficLightAlerts( - read_key="comparisons", - rules=monitoring_rules, - store_key="traffic_lights", - expanded_rules_key="static_bounds_comparisons", - ), - ApplyFunc( - apply_to_key="traffic_lights", - apply_funcs=[{"func": traffic_light_summary, "axis": 1, "suffix": ""}], - assign_to_key="alerts", - msg="Generating traffic light alerts summary.", - ), - AlertsSummary(read_key="alerts"), ] + + modules = ( + get_splitting_modules(hists_key, features, time_axis) + + reference_modules + + get_trend_modules(settings.comparison.window) + + get_dynamic_bound_modules(settings.monitoring.pull_rules) + + get_traffic_light_modules(settings.monitoring.monitoring_rules) + ) super().__init__(modules) class ExpandingReferenceMetricsPipeline(Pipeline): def __init__( self, + settings: Settings, hists_key="test_hists", time_axis="date", - window=10, - shift=1, - monitoring_rules={}, - pull_rules={}, features=None, - **kwargs, ): """Example metrics pipeline for comparing test data with itself (expanding test set) :param str hists_key: key to test histograms in datastore. default is 'test_hists' :param str time_axis: name of datetime feature. default is 'date' - :param int window: window size for trend detection. default is 10 - :param int shift: shift in expanding window. default is 1 - :param dict monitoring_rules: traffic light rules - :param dict pull_rules: pull rules to determine dynamic boundaries :param list features: features of histograms to pick up from input data (optional) - :param kwargs: residual keyword arguments :return: assembled expanding reference pipeline """ - modules = [ - # 1. splitting of test histograms - HistSplitter( - read_key=hists_key, - store_key="split_hists", - features=features, - feature_begins_with=f"{time_axis}:", - ), - # 2. 
for each histogram with datetime i, comparison of histogram i with histogram i-1, results in - # chi2 comparison of histograms - PreviousHistComparer(read_key="split_hists", store_key="comparisons"), + from popmon.analysis.comparison import Comparisons + + reference_prefix = "expanding" + reference_modules: List[Union[Module, Pipeline]] = [ # 3. profiling of reference histograms, then comparison of with profiled test histograms # results in chi2 comparison of histograms ExpandingHistComparer( - read_key="split_hists", shift=shift, store_key="comparisons" + read_key="split_hists", + shift=settings.comparison.shift, + store_key="comparisons", ), # 4. profiling of histograms, then pull calculation compared with reference mean and std, # to obtain normalized residuals of profiles @@ -496,67 +427,26 @@ def __init__( suffix_mean="_mean", suffix_std="_std", suffix_pull="_pull", - metrics=["expanding_max_prob_diff"], + metrics=[ + f"{reference_prefix}_{key}" + for key in Comparisons.get_keys() + if key in ["max_prob_diff", "psi", "jsd"] + ], ), - HistProfiler(read_key="split_hists", store_key="profiles"), ExpandingPullCalculator( read_key="profiles", - shift=shift, + shift=settings.comparison.shift, suffix_mean="_mean", suffix_std="_std", suffix_pull="_pull", ), - # 5. looking for significant rolling linear trends in selected features/metrics - ApplyFunc( - apply_to_key="profiles", - assign_to_key="comparisons", - apply_funcs=[ - { - "func": rolling_lr_zscore, - "suffix": f"_trend{window}_zscore", - "entire": True, - "window": window, - "metrics": ["mean", "phik", "fraction_true"], - } - ], - msg="Computing significance of (rolling) trend in means of features", - ), - # 6. generate dynamic traffic light boundaries, based on traffic lights for normalized residuals, - # used for plotting in popmon_profiles report. 
- DynamicBounds( - read_key="profiles", - rules=pull_rules, - store_key="dynamic_bounds", - suffix_mean="_mean", - suffix_std="_std", - ), - DynamicBounds( - read_key="comparisons", - rules=pull_rules, - store_key="dynamic_bounds_comparisons", - suffix_mean="_mean", - suffix_std="_std", - ), - # 7. expand all (wildcard) static traffic light bounds and apply them. - # Applied to both profiles and comparisons datasets - TrafficLightAlerts( - read_key="profiles", - rules=monitoring_rules, - store_key="traffic_lights", - expanded_rules_key="static_bounds", - ), - TrafficLightAlerts( - read_key="comparisons", - rules=monitoring_rules, - store_key="traffic_lights", - expanded_rules_key="static_bounds_comparisons", - ), - ApplyFunc( - apply_to_key="traffic_lights", - apply_funcs=[{"func": traffic_light_summary, "axis": 1, "suffix": ""}], - assign_to_key="alerts", - msg="Generating traffic light alerts summary.", - ), - AlertsSummary(read_key="alerts"), ] + + modules = ( + get_splitting_modules(hists_key, features, time_axis) + + reference_modules + + get_trend_modules(settings.comparison.window) + + get_dynamic_bound_modules(settings.monitoring.pull_rules) + + get_traffic_light_modules(settings.monitoring.monitoring_rules) + ) super().__init__(modules) diff --git a/popmon/pipeline/report.py b/popmon/pipeline/report.py index 87e22ff0..7039a73a 100644 --- a/popmon/pipeline/report.py +++ b/popmon/pipeline/report.py @@ -19,6 +19,7 @@ import logging +from typing import Optional import pandas as pd from histogrammar.dfinterface.make_histograms import ( @@ -27,7 +28,7 @@ make_histograms, ) -from ..config import config +from ..config import Report, Settings from ..pipeline.report_pipelines import ReportPipe, get_report_pipeline_class from ..resources import templates_env @@ -39,21 +40,11 @@ def stability_report( hists, + settings: Optional[Settings] = None, reference_type="self", reference=None, time_axis="", - window=10, - shift=1, - monitoring_rules=None, - pull_rules=None, 
features=None, - skip_empty_plots=True, - last_n=0, - plot_hist_n=2, - report_filepath=None, - extended_report=True, - show_stats=config["limited_stats"], - **kwargs, ): """Create a data stability monitoring html report for given dict of input histograms. @@ -62,69 +53,16 @@ def stability_report( default is 'self'. :param reference: histograms used as reference. default is None :param str time_axis: name of datetime feature, used as time axis, eg 'date'. auto-guessed when not provided. - :param int window: size of rolling window and/or trend detection. default is 10. - :param int shift: shift of time-bins in rolling/expanding window. default is 1. - :param dict monitoring_rules: monitoring rules to generate traffic light alerts. - The default setting is: - - .. code-block:: python - - monitoring_rules = { - "*_pull": [7, 4, -4, -7], - "*_zscore": [7, 4, -4, -7], - "[!p]*_unknown_labels": [0.5, 0.5, 0, 0], - } - - Note that the (filename based) wildcards such as * apply to all statistic names matching that pattern. - For example, ``"*_pull"`` applies for all features to all statistics ending on "_pull". - You can also specify rules for specific features and/or statistics by leaving out wildcard and putting the - feature name in front. E.g. - - .. code-block:: python - - monitoring_rules = { - "featureA:*_pull": [5, 3, -3, -5], - "featureA:nan": [4, 1, 0, 0], - "*_pull": [7, 4, -4, -7], - "nan": [8, 1, 0, 0], - } - - In case of multiple rules could apply for a feature's statistic, the most specific one applies. - So in case of the statistic "nan": "featureA:nan" is used for "featureA", and the other "nan" rule - for all other features. - :param dict pull_rules: red and yellow (possibly dynamic) boundaries shown in plots in the report. - Default is: - - .. 
code-block:: python - - pull_rules = {"*_pull": [7, 4, -4, -7]} - - This means that the shown yellow boundaries are at -4, +4 standard deviations around the (reference) mean, - and the shown red boundaries are at -7, +7 standard deviations around the (reference) mean. - Note that the (filename based) wildcards such as * apply to all statistic names matching that pattern. - (The same string logic applies as for monitoring_rules.) :param list features: histograms to pick up from the 'hists' dictionary (default is all keys) - :param bool skip_empty_plots: if false, also show empty plots in report with only nans or zeroes (optional) - :param int last_n: plot statistic data for last 'n' periods (optional) - :param int plot_hist_n: plot histograms for last 'n' periods. default is 2 (optional) - :param str report_filepath: the file path where to output the report (optional) - :param bool extended_report: if True, show all the generated statistics in the report (optional) - :param list show_stats: list of statistic name patterns to show in the report. If None, show all (optional) - :param kwargs: residual keyword arguments passed on to report pipeline. 
:return: dict with results of reporting pipeline """ + + if settings is None: + settings = Settings() + # perform basic input checks if not isinstance(hists, dict): raise TypeError("hists should be a dict of histogrammar histograms.") - if not isinstance(monitoring_rules, dict): - monitoring_rules = { - "*_pull": [7, 4, -4, -7], - "*_zscore": [7, 4, -4, -7], - "[!p]*_unknown_labels": [0.5, 0.5, 0, 0], - } - if not isinstance(pull_rules, dict): - pull_rules = {"*_pull": [7, 4, -4, -7]} - if (isinstance(time_axis, str) and len(time_axis) == 0) or ( isinstance(time_axis, bool) and time_axis ): @@ -132,42 +70,48 @@ def stability_report( first_cols = [k.split(":")[0] for k in list(hists.keys())] time_axis = max(set(first_cols), key=first_cols.count) - # if limited report is selected, check if stats list is provided, if not, get a default minimal list - show_stats = show_stats if not extended_report else None - # configuration and datastore for report pipeline cfg = { "hists_key": "hists", - "ref_hists_key": "ref_hists", "time_axis": time_axis, - "window": window, - "shift": shift, - "monitoring_rules": monitoring_rules, - "pull_rules": pull_rules, "features": features, - "skip_empty_plots": skip_empty_plots, - "last_n": last_n, - "plot_hist_n": plot_hist_n, - "report_filepath": report_filepath, - "show_stats": show_stats, - **kwargs, + "settings": settings, } datastore = {"hists": hists} if reference_type == "external": + cfg["ref_hists_key"] = "ref_hists" datastore["ref_hists"] = reference # execute reporting pipeline pipeline = get_report_pipeline_class(reference_type, reference)(**cfg) result = pipeline.transform(datastore) - stability_report = StabilityReport(datastore=result) - return stability_report + stability_report_result = StabilityReport(datastore=result) + return stability_report_result + + +def set_time_axis(df): + time_axes = get_time_axes(df) + num = len(time_axes) + if num == 1: + time_axis = time_axes[0] + logger.info(f'Time-axis automatically set to 
"{time_axis}"') + elif num == 0: + raise ValueError( + "No obvious time-axes found. Cannot generate stability report." + ) + else: + raise ValueError( + f"Found {num} time-axes: {time_axes}. Set *one* time_axis manually!" + ) + return time_axis def df_stability_report( df, time_axis, + settings: Settings = None, features=None, binning="auto", bin_specs=None, @@ -176,17 +120,6 @@ def df_stability_report( var_dtype=None, reference_type="self", reference=None, - window=10, - shift=1, - monitoring_rules=None, - pull_rules=None, - skip_empty_plots=True, - last_n=0, - plot_hist_n=2, - report_filepath=None, - extended_report=True, - show_stats=config["limited_stats"], - **kwargs, ): """Create a data stability monitoring html report for given pandas or spark dataframe. @@ -235,56 +168,12 @@ def df_stability_report( :param reference_type: type or reference used for comparisons. Options [self, external, rolling, expanding]. default is 'self'. :param reference: reference dataframe or histograms. default is None - :param int window: size of rolling window and/or trend detection. default is 10. - :param int shift: shift of time-bins in rolling/expanding window. default is 1. - :param dict monitoring_rules: monitoring rules to generate traffic light alerts. - The default setting is: - - .. code-block:: python - - monitoring_rules = { - "*_pull": [7, 4, -4, -7], - "*_zscore": [7, 4, -4, -7], - "[!p]*_unknown_labels": [0.5, 0.5, 0, 0], - } - - Note that the (filename based) wildcards such as * apply to all statistic names matching that pattern. - For example, ``"*_pull"`` applies for all features to all statistics ending on "_pull". - You can also specify rules for specific features and/or statistics by leaving out wildcard and putting the - feature name in front. E.g. - - .. 
code-block:: python - - monitoring_rules = { - "featureA:*_pull": [5, 3, -3, -5], - "featureA:nan": [4, 1, 0, 0], - "*_pull": [7, 4, -4, -7], - "nan": [8, 1, 0, 0], - } - - In case of multiple rules could apply for a feature's statistic, the most specific one applies. - So in case of the statistic "nan": "featureA:nan" is used for "featureA", and the other "nan" rule - for all other features. - :param dict pull_rules: red and yellow (possibly dynamic) boundaries shown in plots in the report. - Default is: - - .. code-block:: python - - pull_rules = {"*_pull": [7, 4, -4, -7]} - - This means that the shown yellow boundaries are at -4, +4 standard deviations around the (reference) mean, - and the shown red boundaries are at -7, +7 standard deviations around the (reference) mean. - Note that the (filename based) wildcards such as * apply to all statistic names matching that pattern. - (The same string logic applies as for monitoring_rules.) - :param bool skip_empty_plots: if false, also show empty plots in report with only nans or zeroes (optional) - :param int last_n: plot statistic data for last 'n' periods (optional) - :param int plot_hist_n: plot histograms for last 'n' periods. default is 2 (optional) - :param str report_filepath: the file path where to output the report (optional) - :param bool extended_report: if True, show all the generated statistics in the report (optional) - :param list show_stats: list of statistic name patterns to show in the report. If None, show all (optional) - :param kwargs: residual keyword arguments, passed on to stability_report() :return: dict with results of reporting pipeline """ + + if settings is None: + settings = Settings() + # basic checks on presence of time_axis if not (isinstance(time_axis, str) and len(time_axis) > 0) and not ( isinstance(time_axis, bool) and time_axis @@ -298,19 +187,8 @@ def df_stability_report( f'time_axis "{time_axis}" not found in columns of reference dataframe.' 
) if isinstance(time_axis, bool): - time_axes = get_time_axes(df) - num = len(time_axes) - if num == 1: - time_axis = time_axes[0] - logger.info(f'Time-axis automatically set to "{time_axis}"') - elif num == 0: - raise ValueError( - "No obvious time-axes found. Cannot generate stability report." - ) - else: - raise ValueError( - f"Found {num} time-axes: {time_axes}. Set *one* time_axis manually!" - ) + time_axis = set_time_axis(df) + if features is not None: # by now time_axis is defined. ensure that all histograms start with it. if not isinstance(features, list): @@ -381,22 +259,12 @@ def df_stability_report( # generate data stability report return stability_report( - hists, - reference_type, - reference_hists, - time_axis, - window, - shift, - monitoring_rules, - pull_rules, - features, - skip_empty_plots, - last_n, - plot_hist_n, - report_filepath, - extended_report, - show_stats, - **kwargs, + hists=hists, + settings=settings, + reference_type=reference_type, + reference=reference_hists, + time_axis=time_axis, + features=features, ) @@ -476,29 +344,14 @@ def to_notebook_iframe(self, width="100%", height="100%"): def regenerate( self, - last_n=0, - skip_first_n=0, - skip_last_n=0, - plot_hist_n=2, - skip_empty_plots=True, - report_filepath=None, - store_key="html_report", - sections_key="report_sections", - extended_report=True, - show_stats=config["limited_stats"], + store_key: str = "html_report", + sections_key: str = "report_sections", + report_settings: Report = None, ): """Regenerate HTML report with different plot settings - - :param int last_n: plot statistic data for last 'n' periods (optional) - :param int skip_first_n: in plot skip first 'n' periods. last_n takes precedence (optional) - :param int skip_last_n: in plot skip last 'n' periods. last_n takes precedence (optional) - :param int plot_hist_n: plot histograms for last 'n' periods. 
default is 2 (optional) - :param bool skip_empty_plots: if false, also show empty plots in report with only nans or zeroes (optional) - :param str report_filepath: the file path where to output the report (optional) :param str sections_key: key to store sections data in the datastore. default is 'report_sections'. :param str store_key: key to store the HTML report data in the datastore. default is 'html_report' - :param bool extended_report: if True, show all the generated statistics in the report (optional) - :param list show_stats: list of statistic name patterns to show in the report. If None, show all (optional) + :param Report report_settings: configuration to regenerate the report :return HTML: HTML report in an iframe """ # basic checks @@ -511,19 +364,12 @@ def regenerate( del self.datastore[sections_key] if store_key in self.datastore: del self.datastore[store_key] - - # if limited report is selected, check if stats list is provided, if not, get a default minimal list - show_stats = show_stats if not extended_report else None + if report_settings is None: + report_settings = Report() pipeline = ReportPipe( sections_key=sections_key, - last_n=last_n, - skip_first_n=skip_first_n, - skip_last_n=skip_last_n, - skip_empty_plots=skip_empty_plots, - plot_hist_n=plot_hist_n, - report_filepath=report_filepath, - show_stats=show_stats, + settings=report_settings, ) result = pipeline.transform(self.datastore) diff --git a/popmon/pipeline/report_pipelines.py b/popmon/pipeline/report_pipelines.py index d6d2408c..8373ec86 100644 --- a/popmon/pipeline/report_pipelines.py +++ b/popmon/pipeline/report_pipelines.py @@ -21,7 +21,7 @@ from pathlib import Path from ..base import Pipeline -from ..config import config +from ..config import Report, Settings from ..io import FileWriter from ..pipeline.metrics_pipelines import ( ExpandingReferenceMetricsPipeline, @@ -58,61 +58,29 @@ def get_report_pipeline_class(reference_type, reference): class SelfReference(Pipeline): def 
__init__( self, - hists_key="test_hists", - time_axis="date", - window=10, - monitoring_rules={}, - pull_rules={}, - features=None, - skip_empty_plots=True, - last_n=0, - top_n=20, - plot_hist_n=6, - report_filepath=None, - show_stats=None, - disable_heatmap=None, - cmap=None, - **kwargs, + settings: Settings, + features: list, + hists_key: str = "test_hists", + time_axis: str = "date", ): """Example pipeline for comparing test data with itself (full test set) :param str hists_key: key to test histograms in datastore. default is 'test_hists' :param str time_axis: name of datetime feature. default is 'date' (column should be timestamp, date(time) or numeric batch id) - :param int window: window size for trend detection. default is 10 - :param dict monitoring_rules: traffic light rules - :param dict pull_rules: pull rules to determine dynamic boundaries :param list features: features of histograms to pick up from input data (optional) - :param bool skip_empty_plots: if false, also show empty plots in report with only nans or zeroes (optional) - :param int last_n: plot statistic data for last 'n' periods (optional) - :param int top_n: plot heatmap for top 'n' categories. default is 20 (optional) - :param int plot_hist_n: plot histograms for last 'n' periods. default is 1 (optional) - :param str report_filepath: the file path where to output the report (optional) - :param list show_stats: list of statistic name patterns to show in the report. If None, show all (optional) - :param list disable_heatmap: list of heatmap types to disable in the report. 'normal' to disable normal heatmap, 'row' to row normalized, 'column' to disable column normalized. 
If None, show all (optional) - :param kwargs: residual keyword arguments :return: assembled self reference pipeline """ modules = [ SelfReferenceMetricsPipeline( - hists_key, - time_axis, - window, - monitoring_rules, - pull_rules, - features, - **kwargs, + hists_key=hists_key, + time_axis=time_axis, + features=features, + settings=settings, ), ReportPipe( sections_key="report_sections", store_key="html_report", - skip_empty_plots=skip_empty_plots, - last_n=last_n, - top_n=top_n, - plot_hist_n=plot_hist_n, - report_filepath=report_filepath, - show_stats=show_stats, - disable_heatmap=disable_heatmap, - cmap=cmap, + settings=settings.report, ), ] @@ -122,64 +90,33 @@ def __init__( class ExternalReference(Pipeline): def __init__( self, - hists_key="test_hists", - ref_hists_key="ref_hists", - time_axis="date", - window=10, - monitoring_rules={}, - pull_rules={}, + settings: Settings, + hists_key: str = "test_hists", + ref_hists_key: str = "ref_hists", + time_axis: str = "date", features=None, - skip_empty_plots=True, - last_n=0, - top_n=20, - plot_hist_n=2, - report_filepath=None, - show_stats=None, - disable_heatmap=None, - cmap=None, - **kwargs, ): """Example pipeline for comparing test data with other (full) external reference set :param str hists_key: key to test histograms in datastore. default is 'test_hists' :param str ref_hists_key: key to reference histograms in datastore. default is 'ref_hists' :param str time_axis: name of datetime feature. default is 'date' (column should be timestamp, date(time) or numeric batch id) - :param int window: window size for trend detection. 
default is 10 - :param dict monitoring_rules: traffic light rules - :param dict pull_rules: pull rules to determine dynamic boundaries :param list features: features of histograms to pick up from input data (optional) - :param bool skip_empty_plots: if false, show empty plots in report with only nans or zeroes (optional) - :param int last_n: plot statistic data for last 'n' periods (optional) - :param int top_n: plot heatmap for top 'n' categories. default is 20 (optional) - :param int plot_hist_n: plot histograms for last 'n' periods. default is 1 (optional) - :param str report_filepath: the file path where to output the report (optional) - :param list show_stats: list of statistic name patterns to show in the report. If None, show all (optional) - :param list disable_heatmap: list of heatmap types to disable in the report. 'normal' to disable normal heatmap, 'row' to row normalized, 'column' to disable column normalized. If None, show all (optional) :param kwargs: residual keyword arguments :return: assembled external reference pipeline """ modules = [ ExternalReferenceMetricsPipeline( - hists_key, - ref_hists_key, - time_axis, - window, - monitoring_rules, - pull_rules, - features, - **kwargs, + hists_key=hists_key, + ref_hists_key=ref_hists_key, + time_axis=time_axis, + features=features, + settings=settings, ), ReportPipe( sections_key="report_sections", store_key="html_report", - skip_empty_plots=skip_empty_plots, - last_n=last_n, - top_n=top_n, - plot_hist_n=plot_hist_n, - report_filepath=report_filepath, - show_stats=show_stats, - disable_heatmap=disable_heatmap, - cmap=cmap, + settings=settings.report, ), ] @@ -189,64 +126,29 @@ def __init__( class RollingReference(Pipeline): def __init__( self, - hists_key="test_hists", - time_axis="date", - window=10, - shift=1, - monitoring_rules={}, - pull_rules={}, + settings: Settings, + hists_key: str = "test_hists", + time_axis: str = "date", features=None, - skip_empty_plots=True, - last_n=0, - top_n=20, - 
plot_hist_n=6, - report_filepath=None, - show_stats=None, - disable_heatmap=None, - cmap=None, - **kwargs, ): """Example pipeline for comparing test data with itself (rolling test set) :param str hists_key: key to test histograms in datastore. default is 'test_hists' :param str time_axis: name of datetime feature. default is 'date' (column should be timestamp, date(time) or numeric batch id) - :param int window: size of rolling window and for trend detection. default is 10 - :param int shift: shift in rolling window. default is 1 - :param dict monitoring_rules: traffic light rules - :param dict pull_rules: pull rules to determine dynamic boundaries :param list features: features of histograms to pick up from input data (optional) - :param bool skip_empty_plots: if false, show empty plots in report with only nans or zeroes (optional) - :param int last_n: plot statistic data for last 'n' periods (optional) - :param int top_n: plot heatmap for top 'n' categories. default is 20 (optional) - :param int plot_hist_n: plot histograms for last 'n' periods. default is 1 (optional) - :param str report_filepath: the file path where to output the report (optional) - :param list show_stats: list of statistic name patterns to show in the report. If None, show all (optional) - :param list disable_heatmap: list of heatmap types to disable in the report. 'normal' to disable normal heatmap, 'row' to row normalized, 'column' to disable column normalized. 
If None, show all (optional) - :param kwargs: residual keyword arguments :return: assembled rolling reference pipeline """ modules = [ RollingReferenceMetricsPipeline( - hists_key, - time_axis, - window, - shift, - monitoring_rules, - pull_rules, - features, - **kwargs, + settings=settings, + hists_key=hists_key, + time_axis=time_axis, + features=features, ), ReportPipe( sections_key="report_sections", store_key="html_report", - skip_empty_plots=skip_empty_plots, - last_n=last_n, - top_n=top_n, - plot_hist_n=plot_hist_n, - report_filepath=report_filepath, - show_stats=show_stats, - disable_heatmap=disable_heatmap, - cmap=cmap, + settings=settings.report, ), ] @@ -256,64 +158,29 @@ def __init__( class ExpandingReference(Pipeline): def __init__( self, - hists_key="test_hists", - time_axis="date", - window=10, - shift=1, - monitoring_rules={}, - pull_rules={}, + settings: Settings, + hists_key: str = "test_hists", + time_axis: str = "date", features=None, - skip_empty_plots=True, - last_n=0, - top_n=20, - plot_hist_n=6, - report_filepath=None, - show_stats=None, - disable_heatmap=None, - cmap="autumn_r", - **kwargs, ): """Example pipeline for comparing test data with itself (expanding test set) :param str hists_key: key to test histograms in datastore. default is 'test_hists' :param str time_axis: name of datetime feature. default is 'date' (column should be timestamp, date(time) or numeric batch id) - :param int window: window size for trend detection. default is 10 - :param int shift: shift in expanding window. default is 1 - :param dict monitoring_rules: traffic light rules - :param dict pull_rules: pull rules to determine dynamic boundaries :param list features: features of histograms to pick up from input data (optional) - :param bool skip_empty_plots: if false, show empty plots in report with only nans or zeroes (optional) - :param int last_n: plot statistic data for last 'n' periods (optional) - :param int top_n: plot heatmap for top 'n' categories. 
default is 20 (optional) - :param int plot_hist_n: plot histograms for last 'n' periods. default is 1 (optional) - :param str report_filepath: the file path where to output the report (optional) - :param list show_stats: list of statistic name patterns to show in the report. If None, show all (optional) - :param list disable_heatmap: list of heatmap types to disable in the report. 'normal' to disable normal heatmap, 'row' to row normalized, 'column' to disable column normalized. If None, show all (optional) - :param kwargs: residual keyword arguments :return: assembled expanding reference pipeline """ modules = [ ExpandingReferenceMetricsPipeline( - hists_key, - time_axis, - window, - shift, - monitoring_rules, - pull_rules, - features, - **kwargs, + hists_key=hists_key, + time_axis=time_axis, + features=features, + settings=settings, ), ReportPipe( sections_key="report_sections", store_key="html_report", - skip_empty_plots=skip_empty_plots, - last_n=last_n, - top_n=top_n, - plot_hist_n=plot_hist_n, - report_filepath=report_filepath, - show_stats=show_stats, - disable_heatmap=disable_heatmap, - cmap=cmap, + settings=settings.report, ), ] @@ -325,116 +192,72 @@ class ReportPipe(Pipeline): def __init__( self, - sections_key="report_sections", - store_key="html_report", - profiles_section="Profiles", - comparisons_section="Comparisons", - traffic_lights_section="Traffic Lights", - alerts_section="Alerts", - histograms_section="Histograms", - report_filepath=None, - show_stats=None, - skip_empty_plots=True, - last_n=0, - top_n=20, - skip_first_n=0, - skip_last_n=0, - plot_hist_n=6, - disable_heatmap=None, - cmap=None, + settings: Report, + sections_key: str = "report_sections", + store_key: str = "html_report", ): """Initialize an instance of Report. :param str sections_key: key to store sections data in the datastore :param str store_key: key to store the HTML report data in the datastore - :param str profiles_section: name for the profile data section. 
default is 'Profiles' - :param str comparisons_section: name for the comparison data section. default is 'Comparisons' - :param str traffic_lights_section: name for the traffic light section. default is 'Traffic Lights' - :param str alerts_section: name for the alerts section. default is 'Alerts' - :param str histograms_section: name for the histograms section. default is 'Histograms' - :param str report_filepath: the file path where to output the report (optional) - :param bool skip_empty_plots: if false, also show empty plots in report with only nans or zeroes (optional) - :param int last_n: plot statistic data for last 'n' periods (optional) - :param int top_n: plot heatmap for top 'n' categories. default is 20 (optional) - :param int skip_first_n: when plotting data skip first 'n' periods. last_n takes precedence (optional) - :param int skip_last_n: when plotting data skip last 'n' periods. last_n takes precedence (optional) - :param int plot_hist_n: plot histograms for last 'n' periods. default is 1 (optional) - :param list show_stats: list of statistic name patterns to show in the report. If None, show all (optional) - :param list disable_heatmap: list of heatmap types to disable in the report. 'normal' to disable normal heatmap, 'row' to row normalized, 'column' to disable column normalized. 
If None, show all (optional) """ self.store_key = store_key - # dictionary of section descriptions - descs = config["section_descriptions"] - - # default keyword arguments for each section - sg_kws = { - "store_key": sections_key, - "skip_empty_plots": skip_empty_plots, - "last_n": last_n, - "skip_first_n": skip_first_n, - "skip_last_n": skip_last_n, - "show_stats": show_stats, - } - modules = [ OverviewSectionGenerator( read_key="traffic_lights", - description=descs.get("overview", ""), - section_name="Overview", - **sg_kws, + store_key=sections_key, + settings=settings, ), # generate section with histogram HistogramSection( read_key="split_hists", store_key=sections_key, - section_name=histograms_section, hist_name_starts_with="histogram", - last_n=plot_hist_n, - top_n=top_n, - description=descs.get("histograms", ""), - disable_heatmap=disable_heatmap, - cmap=cmap, + settings=settings.section.histograms, ), # section showing all traffic light alerts of monitored statistics TrafficLightSectionGenerator( read_key="traffic_lights", - description=descs.get("traffic_lights", ""), - section_name=traffic_lights_section, - **sg_kws, + store_key=sections_key, + settings=settings, ), # section with a summary of traffic light alerts AlertSectionGenerator( read_key="alerts", - description=descs.get("alerts", ""), - section_name=alerts_section, - **sg_kws, + store_key=sections_key, + settings=settings, ), # section of histogram and pull comparison statistics SectionGenerator( dynamic_bounds="dynamic_bounds_comparisons", static_bounds="static_bounds_comparisons", - section_name=comparisons_section, + section_name=settings.section.comparisons.name, ignore_stat_endswith=["_mean", "_std", "_pull"], read_key="comparisons", - description=descs.get("comparisons", ""), - **sg_kws, + description=settings.section.comparisons.description, + store_key=sections_key, + settings=settings, ), # section of profiled statistics with dynamic or static traffic light bounds SectionGenerator( 
dynamic_bounds="dynamic_bounds", - section_name=profiles_section, + section_name=settings.section.profiles.name, static_bounds="static_bounds", ignore_stat_endswith=["_mean", "_std", "_pull"], read_key="profiles", - description=descs.get("profiles", ""), - **sg_kws, + description=settings.section.profiles.description, + store_key=sections_key, + settings=settings, ), # generate report ReportGenerator(read_key=sections_key, store_key=store_key), ] - if isinstance(report_filepath, (str, Path)) and len(report_filepath) > 0: - modules.append(FileWriter(store_key, file_path=report_filepath)) + if ( + isinstance(settings.report_filepath, (str, Path)) + and len(settings.report_filepath) > 0 + ): + modules.append(FileWriter(store_key, file_path=settings.report_filepath)) super().__init__(modules=modules) diff --git a/popmon/stats/numpy.py b/popmon/stats/numpy.py index 6494d5a9..16c81529 100644 --- a/popmon/stats/numpy.py +++ b/popmon/stats/numpy.py @@ -18,68 +18,8 @@ # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -import warnings - import numpy as np import pandas as pd -from scipy import stats - -from popmon.analysis.comparison.comparisons import Comparisons - - -def fraction_of_true(bin_labels, bin_entries): - """Compute fraction of 'true' labels - - :param bin_labels: Array containing numbers whose mean is desired. If `a` is not an - array, a conversion is attempted. - :param bin_entries: Array containing weights for the elements of `a`. If `weights` is not an - array, a conversion is attempted. 
- :return: fraction of 'true' labels - """ - bin_labels = np.array(bin_labels) - bin_entries = np.array(bin_entries) - assert len(bin_labels) == len(bin_entries) - - def replace(bl): - if bl in {"True", "true"}: - return True - elif bl in {"False", "false"}: - return False - return np.nan - - # basic checks: dealing with boolean labels - # also accept strings of 'True' and 'False' - if len(bin_labels) == 0 or len(bin_labels) > 4 or np.sum(bin_entries) == 0: - return np.nan - if not np.all([isinstance(bl, (bool, np.bool_)) for bl in bin_labels]): - if not np.all( - [isinstance(bl, (str, np.str_, np.string_)) for bl in bin_labels] - ): - return np.nan - # all strings from hereon - n_true = (bin_labels == "True").sum() + (bin_labels == "true").sum() - n_false = (bin_labels == "False").sum() + (bin_labels == "false").sum() - n_nan = ( - (bin_labels == "NaN").sum() - + (bin_labels == "nan").sum() - + (bin_labels == "None").sum() - + (bin_labels == "none").sum() - + (bin_labels == "Null").sum() - + (bin_labels == "null").sum() - ) - if n_true + n_false + n_nan != len(bin_labels): - return np.nan - # convert string to boolean - bin_labels = np.array([replace(bl) for bl in bin_labels]) - - sum_true = np.sum([be for bl, be in zip(bin_labels, bin_entries) if bl]) - sum_false = np.sum([be for bl, be in zip(bin_labels, bin_entries) if not bl]) - sum_entries = sum_true + sum_false - if sum_entries == 0: - # all nans scenario - return np.nan - # exclude nans from fraction - return (1.0 * sum_true) / sum_entries def mean(a, weights=None, axis=None, dtype=None, keepdims=False, ddof=0): @@ -217,179 +157,6 @@ def quantile(a, q, weights=None, axis=None, keepdims: bool = False): return y -def _not_finite_to_zero(x): - res = x.copy() - res[~np.isfinite(res)] = 0 - return res - - -def uu_chi2(n, m, verbose=False): - """Normalized Chi^2 formula for two histograms with different number of entries - - Copyright ROOT: - Formulas translated from c++ to python, but formulas otherwise not 
modified. - Reference: https://root.cern.ch/doc/master/classTH1.html#a6c281eebc0c0a848e7a0d620425090a5 - GNU License: https://root.cern.ch/license - All modifications copyright ING WBAA. - - :param n: 1d array with bin counts of the reference set - :param m: 1d array with bin counts of the test set - :param bool verbose: if true, print warnings in case of empty histograms - :return: tuple of floats (chi2_value, chi2_norm, z_score, p_value, res) - """ - if len(n) == 0 or len(m) == 0: - raise ValueError("Input histogram(s) has zero size.") - if len(n) != len(m): - raise ValueError("Input histograms have unequal size.") - - N = np.sum(n) - M = np.sum(m) - - if N == 0 or M == 0: - if verbose: - warnings.warn( - "Input histogram(s) is empty and cannot be renormalized. Chi2 is undefined." - ) - return np.nan, np.nan, np.nan, np.nan, [0] * len(n) - - # remove all zero entries in the sum, to present division by zero for individual bins - z = n + m - n = n[z != 0] - m = m[z != 0] - - dof = ((n != 0) | (m != 0)).sum() - 1 - chi2_value = _not_finite_to_zero(((M * n - N * m) ** 2) / (n + m)).sum() / M / N - - chi2_norm = chi2_value / dof if dof > 0 else np.nan - p_value = stats.chi2.sf(chi2_value, dof) - z_score = -stats.norm.ppf(p_value) - - p = (n + m) / (N + M) - - if (p == 1).any(): - # unusual case of (only) one bin with p==1, avoids division with zero below - res = np.array([np.nan] * len(p)) - else: - res = _not_finite_to_zero( - (n - N * p) / np.sqrt(N * p) / np.sqrt((1 - N / (N + M)) * (1 - p)) - ) - - return chi2_value, chi2_norm, z_score, p_value, res - - -def ks_test(hist_1, hist_2): - """KS-test for two histograms with different number of entries - - Copyright ROOT: - Formulas translated from c++ to python, but formulas otherwise not modified. - Reference: link: https://root.cern.ch/doc/master/classTH1.html#TH1:KolmogorovTest - GNU license: https://root.cern.ch/license - All modifications copyright ING WBAA. 
- - :param hist_1: 1D array with bin counts of the histogram_1 - :param hist_2: 1D array with bin counts of the histogram_2 - - :return: ks_score: Kolmogorov-Smirnov Test score - :rtype: float - """ - if len(hist_1) == 0 or len(hist_2) == 0: - raise ValueError("Input histogram(s) has zero size.") - if len(hist_1) != len(hist_2): - raise ValueError("Input histograms have unequal size.") - - sum_1 = np.sum(hist_1) - sum_2 = np.sum(hist_2) - if sum_1 == 0 or sum_2 == 0: - return np.nan - - normalized_cumsum_1 = np.cumsum(hist_1) / sum_1 - normalized_cumsum_2 = np.cumsum(hist_2) / sum_2 - - d = np.abs(normalized_cumsum_1 - normalized_cumsum_2) - - return np.max(d) * np.sqrt(sum_1 * sum_2 / (sum_1 + sum_2)) - - -def ks_prob(testscore): - """KS-probability corresponding ti KS test score - - Copyright ROOT: - Formulas translated from c++ to python, but formulas otherwise not modified. - Reference: https://root.cern.ch/doc/master/classTH1.html#TH1:KolmogorovTest - GNU license: https://root.cern.ch/license - All modifications copyright ING WBAA. 
- - :param float testscore: Kolmogorov-Smirnov test score - - :return: approximate pvalue for the Kolmogorov-Smirnov test score - :rtype: float - """ - fj = np.array([-2, -8, -18, -32]) - r = np.zeros(4) - - w = 2.50662827 - c = np.array([-1.2337005501361697, -11.103304951225528, -30.842513753404244]) - - u = abs(testscore) - pvalue = np.nan - if u < 0.2: - pvalue = 1 - elif u < 0.755: - v = np.power(u, -2) - pvalue = 1 - w * np.exp(c * v).sum() / u - elif u < 6.8116: - v = np.power(u, 2) - max_j = int(max(1, round(3.0 / u))) - r[:max_j] = np.exp(fj[:max_j] * v) - pvalue = 2 * (r[0] - r[1] + r[2] - r[3]) - - return pvalue - - -@Comparisons.register( - key="max_prob_diff", - description="The largest absolute difference between all bin pairs of two normalized histograms (one histogram in a time slot and one in {ref})", -) -def googl_test(bins_1, bins_2): - """Google-paper test - - Reference link: https://mlsys.org/Conferences/2019/doc/2019/167.pdf - - :param bins_1: first array of bin entries - :param bins_2: second array of entries - - :return: maximum difference between the two entry distributions - :rtype: float - """ - - def dist(bins): - sum_ = np.sum(bins) - return bins / sum_ if sum_ else bins - - return np.max(np.abs(dist(bins_1) - dist(bins_2))) - - -@Comparisons.register(key="psi", description="Population Stability Index") -def population_stability_index(p, q): - epsilon = 10e-6 - p += epsilon - q += epsilon - return np.sum((p - q) * np.log(p / q)) - - -def kullback_leibler_divergence(p, q): - epsilon = 10e-6 - p += epsilon - q += epsilon - return np.sum(p * np.log(p / q)) - - -@Comparisons.register(key="jsd", description="Jensen-Shannon Divergence") -def jensen_shannon_divergence(p, q): - m = 0.5 * (p + q) - return 0.5 * (kullback_leibler_divergence(p, m) + kullback_leibler_divergence(q, m)) - - def probability_distribution_mean_covariance(entries_list): """Mean normalized histogram and covariance of list of input histograms diff --git 
a/popmon/visualization/alert_section_generator.py b/popmon/visualization/alert_section_generator.py index 710eb886..c28c0811 100644 --- a/popmon/visualization/alert_section_generator.py +++ b/popmon/visualization/alert_section_generator.py @@ -20,14 +20,12 @@ from typing import Optional -import numpy as np import pandas as pd from tqdm import tqdm from ..base import Module -from ..config import get_stat_description -from ..utils import filter_metrics, parallel, short_date -from ..visualization.utils import _prune, plot_bars_b64 +from ..config import Report +from ..utils import filter_metrics, short_date from .traffic_light_section_generator import _plot_metrics @@ -44,39 +42,26 @@ def __init__( self, read_key, store_key, - section_name, + settings: Report, features=None, ignore_features=None, - last_n=0, - skip_first_n=0, - skip_last_n=0, static_bounds=None, dynamic_bounds=None, prefix="traffic_light_", suffices=["_red_high", "_yellow_high", "_yellow_low", "_red_low"], ignore_stat_endswith=None, - skip_empty_plots=True, - description="", - show_stats=None, ): """Initialize an instance of SectionGenerator. :param str read_key: key of input data to read from the datastore and use for plotting :param str store_key: key for output data to be stored in the datastore - :param str section_name: key of output data to store in the datastore :param list features: list of features to pick up from input data (optional) :param list ignore_features: ignore list of features, if present (optional) - :param int last_n: plot statistic data for last 'n' periods (optional) - :param int skip_first_n: when plotting data skip first 'n' periods. last_n takes precedence (optional) - :param int skip_last_n: in plot skip last 'n' periods. 
last_n takes precedence (optional) :param str static_bounds: key to static traffic light bounds key in datastore (optional) :param str dynamic_bounds: key to dynamic traffic light bounds key in datastore (optional) :param str prefix: dynamic traffic light prefix. default is ``'traffic_light_'`` (optional) :param str suffices: dynamic traffic light suffices. (optional) :param list ignore_stat_endswith: ignore stats ending with any of list of suffices. (optional) - :param bool skip_empty_plots: if false, also show empty plots in report with only nans or zeroes (optional) - :param str description: description of the section. default is empty (optional) - :param list show_stats: list of statistic name patterns to show in the report. If None, show all (optional) """ super().__init__() self.read_key = read_key @@ -86,18 +71,19 @@ def __init__( self.features = features or [] self.ignore_features = ignore_features or [] - self.section_name = section_name - self.last_n = last_n - self.skip_first_n = skip_first_n - self.skip_last_n = skip_last_n self.prefix = prefix self.suffices = suffices self.ignore_stat_endswith = ignore_stat_endswith or [] - self.skip_empty_plots = skip_empty_plots - self.description = description - self.show_stats = show_stats - self.plot_overview = True - self.plot_metrics = False + + self.last_n = settings.last_n + self.skip_first_n = settings.skip_first_n + self.skip_last_n = settings.skip_last_n + self.skip_empty_plots = settings.skip_empty_plots + self.show_stats = settings.show_stats if not settings.extended_report else None + + self.section_name = settings.section.alerts.name + self.description = settings.section.alerts.description + self.descriptions = settings.section.alerts.descriptions def get_description(self): return self.section_name @@ -145,40 +131,20 @@ def transform( df.columns, self.ignore_stat_endswith, self.show_stats ) - plots = [] - if self.plot_overview: - plots.append( - _plot_metrics( - feature, - [m for m in metrics if not 
m.endswith("worst")], - dates, - df, - 0, - 0, - 0, - 0, - style="alerts", - ) + plots = [ + _plot_metrics( + feature, + metrics, + dates, + df, + 0, + 0, + 0, + 0, + style="alerts", ) - if self.plot_metrics: - args = [ - ( - feature, - metric, - dates, - df[metric], - static_bounds, - fdbounds, - self.prefix, - self.suffices, - self.last_n, - self.skip_first_n, - self.skip_last_n, - self.skip_empty_plots, - ) - for metric in metrics - ] - plots += parallel(_plot_metric, args) + ] + # filter out potential empty plots (from skip empty plots) if self.skip_empty_plots: plots = [e for e in plots if len(e["plot"])] @@ -195,46 +161,3 @@ def transform( } ) return sections - - -def _plot_metric( - feature, - metric, - dates, - values, - static_bounds, - fdbounds, - prefix, - suffices, - last_n, - skip_first_n, - skip_last_n, - skip_empty, -): - """Split off plot histogram generation to allow for parallel processing""" - # pick up static traffic light boundaries - name = feature + ":" + metric - sbounds = static_bounds.get(name, ()) - # pick up dynamic traffic light boundaries - names = [prefix + metric + suffix for suffix in suffices] - dbounds = tuple( - _prune(fdbounds[n].tolist(), last_n, skip_first_n, skip_last_n) - for n in names - if n in fdbounds.columns - ) - # choose dynamic bounds if present - bounds = dbounds if len(dbounds) > 0 else sbounds - # prune dates and values - dates = _prune(dates, last_n, skip_first_n, skip_last_n) - values = _prune(values, last_n, skip_first_n, skip_last_n) - - # make plot. note: slow! 
- plot = plot_bars_b64( - data=np.array(values), - labels=dates, - ylim=True, - bounds=bounds, - skip_empty=skip_empty, - ) - - return {"name": metric, "description": get_stat_description(metric), "plot": plot} diff --git a/popmon/visualization/histogram_section.py b/popmon/visualization/histogram_section.py index bfa41871..a54b4e22 100644 --- a/popmon/visualization/histogram_section.py +++ b/popmon/visualization/histogram_section.py @@ -31,7 +31,7 @@ get_consistent_numpy_entries, ) from ..base import Module -from ..config import get_stat_description +from ..config import HistogramSectionModel from ..utils import parallel, short_date from ..visualization.utils import plot_heatmap_b64, plot_overlay_1d_histogram_b64 @@ -46,29 +46,20 @@ def __init__( self, read_key, store_key, - section_name="Histograms", + settings: HistogramSectionModel, features=None, ignore_features=None, - last_n=1, - top_n=20, hist_names=None, hist_name_starts_with="histogram", - description="", - disable_heatmap=None, - cmap="autumn_r", ): """Initialize an instance of SectionGenerator. :param str read_key: key of input data to read from the datastore and use for plotting :param str store_key: key for output data to be stored in the datastore - :param str section_name: key of output data to store in the datastore :param list features: list of features to pick up from input data (optional) :param list ignore_features: ignore list of features, if present (optional) - :param int last_n: plot histogram for last 'n' periods. default is 1 (optional) - :param int top_n: plot heatmap for top 'n' categories. default is 20 (optional) :param list hist_names: list of histogram names to plot :param str hist_name_starts_with: find histograms in case hist_names is empty. default is histogram. - :param str description: description of the section. 
default is empty (optional) """ super().__init__() self.read_key = read_key @@ -76,16 +67,18 @@ def __init__( self.features = features or [] self.ignore_features = ignore_features or [] - self.section_name = section_name - self.last_n = last_n if last_n >= 0 else 0 - self.top_n = top_n if top_n >= 1 else 20 self.hist_names = hist_names or [] self.hist_name_starts_with = hist_name_starts_with - self.description = description - self.disable_heatmap = disable_heatmap or [] - if cmap is None: - cmap = "autumn_r" - self.cmap = cmap + + # section specific + self.section_name = settings.name + self.descriptions = settings.descriptions + self.description = settings.description + self.hist_names = settings.hist_names + self.hist_names_formatted = settings.hist_names_formatted + self.plot_hist_n = settings.plot_hist_n + self.top_n = settings.top_n + self.cmap = settings.cmap def get_description(self): return self.section_name @@ -101,7 +94,9 @@ def transform(self, data_obj: dict, sections: Optional[list] = None): for feature in tqdm(features, ncols=100): df = data_obj.get(feature, pd.DataFrame()) - last_n = len(df.index) if len(df.index) < self.last_n else self.last_n + last_n = ( + len(df.index) if len(df.index) < self.plot_hist_n else self.plot_hist_n + ) hist_names = [hn for hn in self.hist_names if hn in df.columns] if len(hist_names) == 0 and len(self.hist_name_starts_with) > 0: # if no columns are given, find histogram columns. @@ -127,8 +122,10 @@ def transform(self, data_obj: dict, sections: Optional[list] = None): dates, [h[0] for h in hists], self.top_n, - self.disable_heatmap, self.cmap, + self.hist_names, + self.hist_names_formatted, + self.descriptions, ) # get base64 encoded plot for each metric; do parallel processing to speed up. 
@@ -148,11 +145,8 @@ def transform(self, data_obj: dict, sections: Optional[list] = None): plots = sorted(plots, key=lambda plot: plot["name"]) # filter out potential empty heatmap plots, then prepend them to the sorted histograms - hplots = [] - for h in heatmaps: - if isinstance(h, dict): - if len(h["plot"]): - hplots.append(h) + hplots = [h for h in heatmaps if isinstance(h, dict) and len(h["plot"])] + plots = hplots + plots features_w_metrics.append({"name": feature, "plots": plots}) @@ -188,14 +182,14 @@ def _plot_histograms(feature, date, hc_list, hist_names, top_n, max_nbins=1000): hist_names = [hn for i, hn in enumerate(hist_names) if i not in none_hists] # more basic checks if len(hc_list) == 0: - return {"name": date, "description": get_stat_description(date), "plot": ""} + return {"name": date, "description": "", "plot": ""} assert_similar_hists(hc_list) # make plot. note: slow! if hc_list[0].n_dim == 1: if all(h.size == 0 for h in hc_list): # triviality checks, skip all histograms empty - return {"name": date, "description": get_stat_description(date), "plot": ""} + return {"name": date, "description": "", "plot": ""} props = get_hist_props(hc_list[0]) is_num = props["is_num"] @@ -214,7 +208,7 @@ def _plot_histograms(feature, date, hc_list, hist_names, top_n, max_nbins=1000): # skip histograms with too many bins to plot (default more than 1000) if len(bins) > max_nbins: - return {"name": date, "description": get_stat_description(date), "plot": ""} + return {"name": date, "description": "", "plot": ""} # normalize histograms for plotting (comparison!) in case there is more than one. 
if len(hc_list) >= 2: @@ -240,31 +234,19 @@ def _plot_histograms(feature, date, hc_list, hist_names, top_n, max_nbins=1000): else: plot = "" - return {"name": date, "description": get_stat_description(date), "plot": plot} - - -def _plot_heatmap(feature, date, hc_list, top_n, disable_heatmap, cmap): - hist_names = [ - "heatmap", - "heatmap_column_normalized", - "heatmap_row_normalized", - ] - hist_names_formatted = { - "heatmap": "Heatmap", - "heatmap_column_normalized": "Column-Normalized Heatmap", - "heatmap_row_normalized": "Row-Normalized Heatmap", - } - - for d in disable_heatmap: - if d == "normal": - hist_names.remove("heatmap") - elif d == "row": - hist_names.remove("heatmap_row_normalized") - elif d == "column": - hist_names.remove("heatmap_column_normalized") - else: - raise ValueError("Invalid argument in disable_heatmap: ", d) + return {"name": date, "description": "", "plot": plot} + +def _plot_heatmap( + feature, + date, + hc_list, + top_n, + cmap, + hist_names, + hist_names_formatted, + descriptions, +): # basic checks if len(hist_names) <= 0: # skip numeric heatmap @@ -301,12 +283,12 @@ def _plot_heatmap(feature, date, hc_list, top_n, disable_heatmap, cmap): # make 3 copies : 1st normal, 2nd for column normalized heatmap, 3rd for row normalized heatmap hists = [] - if "normal" not in disable_heatmap: + if "heatmap" in hist_names: hist_normal = entries_list.copy() hists.append(hist_normal) # normalize across column for a plot - if "column" not in disable_heatmap: + if "heatmap_column_normalized" in hist_names: hist_col = entries_list.copy() hist_col = np.stack(hist_col, axis=1) hist_col = hist_col.astype(float) @@ -317,7 +299,7 @@ def _plot_heatmap(feature, date, hc_list, top_n, disable_heatmap, cmap): hists.append(hist_col) # normalize across row for a plot - if "row" not in disable_heatmap: + if "heatmap_row_normalized" in hist_names: hist_row = entries_list.copy() hist_row = hist_row.astype(float) for i in range(hist_row.shape[0]): @@ -344,7 +326,7 
@@ def _plot_heatmap(feature, date, hc_list, top_n, disable_heatmap, cmap): plots = [ { "name": hist_names_formatted[hist_name], - "description": get_stat_description(hist_name), + "description": descriptions[hist_name], "plot": pl, "full_width": True, } diff --git a/popmon/visualization/overview_section.py b/popmon/visualization/overview_section.py index 7afe4b7a..da268a88 100644 --- a/popmon/visualization/overview_section.py +++ b/popmon/visualization/overview_section.py @@ -25,6 +25,7 @@ from tqdm import tqdm from ..base import Module +from ..config import Report from ..resources import templates_env from ..utils import filter_metrics from ..visualization.utils import _prune @@ -43,41 +44,26 @@ def __init__( self, read_key, store_key, - section_name, + settings: Report, features=None, ignore_features=None, - last_n=0, - skip_first_n=0, - skip_last_n=0, static_bounds=None, dynamic_bounds=None, prefix="traffic_light_", suffices=["_red_high", "_yellow_high", "_yellow_low", "_red_low"], ignore_stat_endswith=None, - skip_empty_plots=True, - description="", - show_stats=None, ): """Initialize an instance of SectionGenerator. :param str read_key: key of input data to read from the datastore and use for plotting :param str store_key: key for output data to be stored in the datastore - :param str section_name: key of output data to store in the datastore :param list features: list of features to pick up from input data (optional) :param list ignore_features: ignore list of features, if present (optional) - :param int last_n: plot statistic data for last 'n' periods (optional) - :param int skip_first_n: when plotting data skip first 'n' periods. last_n takes precedence (optional) - :param int skip_last_n: in plot skip last 'n' periods. 
last_n takes precedence (optional) :param str static_bounds: key to static traffic light bounds key in datastore (optional) :param str dynamic_bounds: key to dynamic traffic light bounds key in datastore (optional) :param str prefix: dynamic traffic light prefix. default is ``'traffic_light_'`` (optional) :param str suffices: dynamic traffic light suffices. (optional) :param list ignore_stat_endswith: ignore stats ending with any of list of suffices. (optional) - :param bool skip_empty_plots: if false, also show empty plots in report with only nans or zeroes (optional) - :param str description: description of the section. default is empty (optional) - :param list show_stats: list of statistic name patterns to show in the report. If None, show all (optional) - :param bool plot_overview: heatmap overview of traffic lights (features x time) - :param bool plot_metrics: individual plot per feature """ super().__init__() self.read_key = read_key @@ -87,16 +73,17 @@ def __init__( self.features = features or [] self.ignore_features = ignore_features or [] - self.section_name = section_name - self.last_n = last_n - self.skip_first_n = skip_first_n - self.skip_last_n = skip_last_n self.prefix = prefix self.suffices = suffices self.ignore_stat_endswith = ignore_stat_endswith or [] - self.skip_empty_plots = skip_empty_plots - self.description = description - self.show_stats = show_stats + + self.last_n = settings.last_n + self.skip_first_n = settings.skip_first_n + self.skip_last_n = settings.skip_last_n + self.skip_empty_plots = settings.skip_empty_plots + self.show_stats = settings.show_stats if not settings.extended_report else None + self.section_name = settings.section.overview.name + self.description = settings.section.overview.description def get_description(self): return self.section_name diff --git a/popmon/visualization/section_generator.py b/popmon/visualization/section_generator.py index f06fbc10..120523cb 100644 --- a/popmon/visualization/section_generator.py +++ 
b/popmon/visualization/section_generator.py @@ -24,11 +24,48 @@ import pandas as pd from tqdm import tqdm +from popmon.analysis.comparison import Comparisons +from popmon.analysis.profiling import Profiles + from ..base import Module -from ..config import get_stat_description +from ..config import Report from ..utils import filter_metrics, parallel, short_date from ..visualization.utils import _prune, plot_bars_b64 +profiles = Profiles.get_descriptions() + +comparisons = Comparisons.get_descriptions() + + +references = { + "ref": "the reference data", + "roll": "a rolling window", + "prev1": "the preceding time slot", + "expanding": "all preceding time slots", +} + + +def get_stat_description(name: str): + """Gets the description of a statistic. + + :param str name: the name of the statistic. + + :returns str: the description of the statistic. If not found, returns an empty string + """ + if not isinstance(name, str): + raise TypeError("Statistic's name should be a string.") + + if name in profiles: + return profiles[name] + + head, *tail = name.split("_") + tail = "_".join(tail) + + if tail in comparisons and head in references: + return comparisons[tail].format(ref=references[head]) + + return "" + class SectionGenerator(Module): """This module takes the time-series data of already computed statistics, plots the data and @@ -44,19 +81,15 @@ def __init__( read_key, store_key, section_name, + settings: Report, features=None, ignore_features=None, - last_n=0, - skip_first_n=0, - skip_last_n=0, static_bounds=None, dynamic_bounds=None, prefix="traffic_light_", suffices=["_red_high", "_yellow_high", "_yellow_low", "_red_low"], ignore_stat_endswith=None, - skip_empty_plots=True, description="", - show_stats=None, ): """Initialize an instance of SectionGenerator. 
@@ -65,17 +98,12 @@ def __init__( :param str section_name: key of output data to store in the datastore :param list features: list of features to pick up from input data (optional) :param list ignore_features: ignore list of features, if present (optional) - :param int last_n: plot statistic data for last 'n' periods (optional) - :param int skip_first_n: when plotting data skip first 'n' periods. last_n takes precedence (optional) - :param int skip_last_n: in plot skip last 'n' periods. last_n takes precedence (optional) :param str static_bounds: key to static traffic light bounds key in datastore (optional) :param str dynamic_bounds: key to dynamic traffic light bounds key in datastore (optional) :param str prefix: dynamic traffic light prefix. default is ``'traffic_light_'`` (optional) :param str suffices: dynamic traffic light suffices. (optional) :param list ignore_stat_endswith: ignore stats ending with any of list of suffices. (optional) - :param bool skip_empty_plots: if false, also show empty plots in report with only nans or zeroes (optional) :param str description: description of the section. default is empty (optional) - :param list show_stats: list of statistic name patterns to show in the report. 
If None, show all (optional) """ super().__init__() self.read_key = read_key @@ -86,15 +114,15 @@ def __init__( self.features = features or [] self.ignore_features = ignore_features or [] self.section_name = section_name - self.last_n = last_n - self.skip_first_n = skip_first_n - self.skip_last_n = skip_last_n + self.last_n = settings.last_n + self.skip_first_n = settings.skip_first_n + self.skip_last_n = settings.skip_last_n self.prefix = prefix self.suffices = suffices self.ignore_stat_endswith = ignore_stat_endswith or [] - self.skip_empty_plots = skip_empty_plots + self.skip_empty_plots = settings.skip_empty_plots self.description = description - self.show_stats = show_stats + self.show_stats = settings.show_stats if not settings.extended_report else None def get_description(self): return self.section_name @@ -190,7 +218,7 @@ def _plot_metric( ): """Split off plot histogram generation to allow for parallel processing""" # pick up static traffic light boundaries - name = feature + ":" + metric + name = f"{feature}:{metric}" sbounds = static_bounds.get(name, ()) # pick up dynamic traffic light boundaries names = [prefix + metric + suffix for suffix in suffices] diff --git a/popmon/visualization/templates/aggregated-overview.html b/popmon/visualization/templates/aggregated-overview.html index 64133aac..9ffc73f6 100644 --- a/popmon/visualization/templates/aggregated-overview.html +++ b/popmon/visualization/templates/aggregated-overview.html @@ -6,11 +6,11 @@
{% for i in [2, 1, 0] %} {% if vals[i] > 0 %} -
+
{{ vals[i] }} ({{ "%d" % (vals[i] / (vals["total"] * 0.01))}}%)
{% else %} -
 
+
 
{% endif %} {% endfor %}
diff --git a/popmon/visualization/templates/assets/css/custom-style.css b/popmon/visualization/templates/assets/css/custom-style.css index 80f56276..6505755d 100644 --- a/popmon/visualization/templates/assets/css/custom-style.css +++ b/popmon/visualization/templates/assets/css/custom-style.css @@ -64,17 +64,17 @@ table.overview tbody td.metric{ min-width: max-content; width: 200px; } -table.overview tbody td.cell{ - border: 1px solid #333333; +table.overview tbody td:not(.metric){ + border: 1px solid #333; text-align: center; } -table.overview td.cell-green{ +.g{ background: green; } -table.overview td.cell-yellow{ +.y{ background: rgba(255, 200, 0, 1.0); } -table.overview td.cell-red{ +.r{ background: rgba(255, 0, 0, 1.0); } table.overview tfoot td { @@ -93,16 +93,6 @@ table.overview tfoot td span{ } /* overview progress bars */ -.tl-bar.green{ - background-color: green; -} -.tl-bar.yellow{ - background-color: rgba(255, 200, 0, 1.0); -} -.tl-bar.red{ - background-color: rgba(255, 0, 0, 1.0); -} - .tl-bar{ padding-top:10px; padding-bottom:10px; diff --git a/popmon/visualization/templates/assets/js/custom-script.js b/popmon/visualization/templates/assets/js/custom-script.js index 9342c36d..7449648e 100644 --- a/popmon/visualization/templates/assets/js/custom-script.js +++ b/popmon/visualization/templates/assets/js/custom-script.js @@ -1,19 +1,30 @@ // hide all except first feature data for each section -$( "section" ).each(function() { - $( this ).find(".section_feature:not(:first)").hide(); +$("section").each(function() { + $(this).find(".section_feature:not(:first)").hide(); }); + // show corresponding feature's data based on the filter $(document).on("click", "button.dropdown-item", function() { obj = $(this) -// obj.closest("section").find("div.section_feature").hide() -// obj.closest("section").find("div[data-section-feature='" + obj.attr("data-feature") + "']").show() -// obj.parent().siblings("button").text("Feature: " + obj.text()) - // Linked dropdowns 
$("div.section_feature").hide() $("div[data-section-feature='" + obj.attr("data-feature") + "']").show() $("button.dropdown-toggle").text("Feature: " + obj.text()) + + // Current section (if any), e.g. #histograms + var type = window.location.hash.substr(1); + if (type.length > 0){ + // Find link to that section + var o = $("a.nav-link.js-scroll-trigger[href='#" + type +"'"); + + // If exists + if (o.length == 1){ + // Move to that location + var offset = $("section[data-section-title='" + o.attr("data-scroll-to-section") + "']").offset().top; + window.scrollTo(0, offset); + } + } }); $(document).on("click", "a.table-item", function(){ @@ -23,18 +34,28 @@ $(document).on("click", "a.table-item", function(){ // making navigation work: after clicking a nav link scrolling to the corresponding section's position $(document).on("click", "a.nav-link,a.navbar-brand", function(e) { - e.preventDefault(); + /*e.preventDefault();*/ obj = $(this) $([document.documentElement, document.body]).animate({ scrollTop: $("section[data-section-title='" + obj.attr("data-scroll-to-section") + "']").offset().top }, 1000); }); + +function slugify(str) { + // Convert string to id used in url + str = str.replace(/^\s+|\s+$/g, '').toLowerCase(); + str = str.replace(/[^a-z0-9 -]/g, '').replace(/\s+/g, '-').replace(/-+/g, '-'); + return str; +}; + // automatic insertion of navigation links based on section titles $('section').each(function(i, el){ title = $(this).attr("data-section-title"); - code = '' + slug = slugify(title) + $(this).attr('id', slug); + code = '' $("ul#navigation-sections").append(code); - if ( i === 0) { + if ( i === 0 ) { $("a.navbar-brand").attr('data-scroll-to-section', title); } }); diff --git a/popmon/visualization/templates/table.html b/popmon/visualization/templates/table.html index 6e3066d1..e3c94cef 100644 --- a/popmon/visualization/templates/table.html +++ b/popmon/visualization/templates/table.html @@ -14,15 +14,15 @@ {% for label in labels %} {% if 
data[metric][label] | length == 1 %} {% with class = data[metric][label][0] %} -   +   {% endwith %} {% elif data[metric][label] | length == 2 %} {% with rgba, value = data[metric][label] %} - {{ value }} + {{ value }} {% endwith %} {% elif data[metric][label] | length == 3 %} {% with text_color, rgba, value = data[metric][label] %} - {{ value }} + {{ value }} {% endwith %} {% endif %} {% endfor %} diff --git a/popmon/visualization/traffic_light_section_generator.py b/popmon/visualization/traffic_light_section_generator.py index 08bf6bb4..df3b3b41 100644 --- a/popmon/visualization/traffic_light_section_generator.py +++ b/popmon/visualization/traffic_light_section_generator.py @@ -25,12 +25,11 @@ from tqdm import tqdm from ..base import Module -from ..config import get_stat_description -from ..utils import filter_metrics, parallel, short_date +from ..config import Report +from ..utils import filter_metrics, short_date from ..visualization.utils import ( _prune, plot_traffic_lights_alerts_b64, - plot_traffic_lights_b64, plot_traffic_lights_overview, ) @@ -48,43 +47,26 @@ def __init__( self, read_key, store_key, - section_name, + settings: Report, features=None, ignore_features=None, - last_n=0, - skip_first_n=0, - skip_last_n=0, static_bounds=None, dynamic_bounds=None, prefix="traffic_light_", suffices=["_red_high", "_yellow_high", "_yellow_low", "_red_low"], ignore_stat_endswith=None, - skip_empty_plots=True, - description="", - show_stats=None, - plot_overview=True, - plot_metrics=False, ): """Initialize an instance of SectionGenerator. 
:param str read_key: key of input data to read from the datastore and use for plotting :param str store_key: key for output data to be stored in the datastore - :param str section_name: key of output data to store in the datastore :param list features: list of features to pick up from input data (optional) :param list ignore_features: ignore list of features, if present (optional) - :param int last_n: plot statistic data for last 'n' periods (optional) - :param int skip_first_n: when plotting data skip first 'n' periods. last_n takes precedence (optional) - :param int skip_last_n: in plot skip last 'n' periods. last_n takes precedence (optional) :param str static_bounds: key to static traffic light bounds key in datastore (optional) :param str dynamic_bounds: key to dynamic traffic light bounds key in datastore (optional) :param str prefix: dynamic traffic light prefix. default is ``'traffic_light_'`` (optional) :param str suffices: dynamic traffic light suffices. (optional) :param list ignore_stat_endswith: ignore stats ending with any of list of suffices. (optional) - :param bool skip_empty_plots: if false, also show empty plots in report with only nans or zeroes (optional) - :param str description: description of the section. default is empty (optional) - :param list show_stats: list of statistic name patterns to show in the report. 
If None, show all (optional) - :param bool plot_overview: heatmap overview of traffic lights (features x time) - :param bool plot_metrics: individual plot per feature """ super().__init__() self.read_key = read_key @@ -94,18 +76,17 @@ def __init__( self.features = features or [] self.ignore_features = ignore_features or [] - self.section_name = section_name - self.last_n = last_n - self.skip_first_n = skip_first_n - self.skip_last_n = skip_last_n + self.last_n = settings.last_n + self.skip_first_n = settings.skip_first_n + self.skip_last_n = settings.skip_last_n self.prefix = prefix self.suffices = suffices self.ignore_stat_endswith = ignore_stat_endswith or [] - self.skip_empty_plots = skip_empty_plots - self.description = description - self.show_stats = show_stats - self.plot_overview = plot_overview - self.plot_metrics = plot_metrics + self.skip_empty_plots = settings.skip_empty_plots + self.show_stats = settings.show_stats if not settings.extended_report else None + + self.section_name = settings.section.traffic_lights.name + self.description = settings.section.traffic_lights.description def get_description(self): return self.section_name @@ -145,39 +126,22 @@ def transform( ) dates = [short_date(str(date)) for date in df.index.tolist()] - metrics = filter_metrics( - df.columns, self.ignore_stat_endswith, self.show_stats + metrics = sorted( + filter_metrics(df.columns, self.ignore_stat_endswith, self.show_stats) ) - plots = [] - if self.plot_overview: - plots.append( - _plot_metrics( - feature, - metrics, - dates, - df, - self.last_n, - self.skip_first_n, - self.skip_last_n, - self.skip_empty_plots, - ) + plots = [ + _plot_metrics( + feature, + metrics, + dates, + df, + self.last_n, + self.skip_first_n, + self.skip_last_n, + self.skip_empty_plots, ) - - if self.plot_metrics: - args = [ - ( - metric, - dates, - df[metric], - self.last_n, - self.skip_first_n, - self.skip_last_n, - self.skip_empty_plots, - ) - for metric in metrics - ] - plots += 
parallel(_plot_metric, args) + ] # filter out potential empty plots (from skip empty plots) if self.skip_empty_plots: @@ -196,21 +160,6 @@ def transform( return sections -def _plot_metric(metric, dates, values, last_n, skip_first_n, skip_last_n, skip_empty): - """Split off plot histogram generation to allow for parallel processing""" - - # prune dates and values - dates = _prune(dates, last_n, skip_first_n, skip_last_n) - values = _prune(values, last_n, skip_first_n, skip_last_n) - - # make plot. note: slow! - plot = plot_traffic_lights_b64( - data=np.array(values), labels=dates, skip_empty=skip_empty - ) - - return {"name": metric, "description": get_stat_description(metric), "plot": plot} - - def _plot_metrics( feature, metrics, @@ -237,7 +186,6 @@ def _plot_metrics( if len(values) > 0: values = np.stack(values) - # make plot. note: slow! if style == "heatmap": plot = plot_traffic_lights_overview( feature, values, metrics=nonempty_metrics, labels=dates diff --git a/popmon/visualization/utils.py b/popmon/visualization/utils.py index ceb6d6e1..694b44c1 100644 --- a/popmon/visualization/utils.py +++ b/popmon/visualization/utils.py @@ -20,6 +20,7 @@ import logging import math +from collections import defaultdict from io import BytesIO, StringIO from typing import List @@ -162,10 +163,9 @@ def plot_bars_b64(data, labels=None, bounds=None, ylim=False, skip_empty=True): def render_traffic_lights_table(feature, data, metrics: List[str], labels: List[str]): - colors = {} - color_map = ["green", "yellow", "red"] + colors = defaultdict(dict) + color_map = ["g", "y", "r"] for c1, metric in enumerate(metrics): - colors[metric] = {} for c2, label in enumerate(labels): colors[metric][label] = [color_map[data[c1][c2]]] @@ -184,9 +184,8 @@ def plot_traffic_lights_overview(feature, data, metrics=None, labels=None): def render_alert_aggregate_table(feature, data, metrics: List[str], labels: List[str]): - colors = {} + colors = defaultdict(dict) for c1, metric in enumerate(metrics): 
- colors[metric] = {} row_max = np.max(data[c1]) for c2, label in enumerate(labels): a = data[c1][c2] / row_max if row_max and row_max != 0 else 0 @@ -226,70 +225,6 @@ def plot_traffic_lights_alerts_b64(feature, data, metrics=None, labels=None): return render_alert_aggregate_table(feature, data.astype(int), metrics, labels) -def plot_traffic_lights_b64(data, labels=None, skip_empty=True): - """Plotting histogram data. - - :param np.array data: bin values of a histogram - :param labels: common bin labels for all histograms (optional) - :param bool skip_empty: if true, skip empty plots with only nans or only zeroes (optional) - - :return: base64 encoded plot image - :rtype: string - """ - # basic checks first - n = data.size # number of bins - if labels and len(labels) != n: - raise ValueError("shape mismatch: x-axis labels do not match the data shape") - - # skip plot generation for empty datasets - if skip_empty: - n_data = len(data) - n_zero = n_data - np.count_nonzero(data) - n_nan = pd.isnull(data).sum() - n_inf = np.sum([np.isinf(x) for x in data if isinstance(x, float)]) - if n_nan + n_zero + n_inf == n_data: - logger.debug("skipping plot with empty data.") - return "" - - fig, ax = plt.subplots() - - ax.yaxis.grid(True) - ax.xaxis.grid(False) - - colors = ["green", "yellow", "red"] - ones = np.ones(n) - - index = np.arange(n) - - for i, color in enumerate(colors): - mask = data == i - ax.bar( - index[mask], - ones[mask], - width=1, - align="center", - color=color, - alpha=0.8, - edgecolor="black", - ) - - ax.set_yticks([]) - - if labels: - ax.set_xticks(index) - ax.set_xticklabels(labels, fontdict={"rotation": "vertical"}) - granularity = math.ceil(len(labels) / 50) - [ - l.set_visible(False) - for (i, l) in enumerate(ax.xaxis.get_ticklabels()) - if i % granularity != 0 - ] - - fig.tight_layout() - - return plt_to_str(fig) - - def grouped_bar_chart_b64(data, labels, legend): """Plotting grouped histogram data. 
@@ -546,10 +481,9 @@ def xtick(lab): """Get x-tick.""" lab = str(lab) if len(lab) > top: - lab = lab[:17] + "..." + lab = lab[: top - 3] + "..." return lab - # plt.xlim((0.0, float(len(date)))) plt.xticks(tick_pos_x, date, fontsize=20, rotation=90) plt.yticks(tick_pos_y, [xtick(lab) for lab in labels], fontsize=20) im_ratio = values.shape[0] / values.shape[1] diff --git a/requirements.txt b/requirements.txt index 9b87d23f..db1ac73d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,3 +10,4 @@ joblib>=0.14.0 pybase64>=1.0.1 htmlmin ing_theme_matplotlib>=0.1.8 +pydantic \ No newline at end of file diff --git a/tests/popmon/alerting/test_apply_tl_bounds.py b/tests/popmon/alerting/test_apply_tl_bounds.py index e89dd412..176a2108 100644 --- a/tests/popmon/alerting/test_apply_tl_bounds.py +++ b/tests/popmon/alerting/test_apply_tl_bounds.py @@ -19,6 +19,7 @@ rolling_mean, ) from popmon.base import Pipeline +from popmon.config import Settings from popmon.visualization.section_generator import SectionGenerator @@ -225,16 +226,14 @@ def test_rolling_window_funcs(): def test_report_traffic_light_bounds(): datastore = {"to_profile": {"asc_numbers": get_test_data()}} - - conf = { - "monitoring_rules": { - "the_feature:mae": [8, 4, 2, 0.15], - "mse": [0.2, 0.11, 0.09, 0], - "mae": [1, 0, 0, 0], - "*_pull": [7, 4, -4, -7], - }, - "pull_rules": {"*_pull": [7, 4, -4, -7]}, + settings = Settings() + settings.monitoring.monitoring_rules = { + "the_feature:mae": [8, 4, 2, 0.15], + "mse": [0.2, 0.11, 0.09, 0], + "mae": [1, 0, 0, 0], + "*_pull": [7, 4, -4, -7], } + settings.monitoring.pull_rules = {"*_pull": [7, 4, -4, -7]} m1 = ApplyFunc( apply_to_key="to_profile", features=["asc_numbers"], metrics=["a", "b"] @@ -250,12 +249,12 @@ def test_report_traffic_light_bounds(): ctlb = ComputeTLBounds( read_key="to_profile", store_key="static_tlb", - monitoring_rules=conf["monitoring_rules"], + monitoring_rules=settings.monitoring.monitoring_rules, ) m3 = ComputeTLBounds( 
read_key="to_profile", - monitoring_rules=conf["pull_rules"], + monitoring_rules=settings.monitoring.pull_rules, apply_funcs_key="dynamic_tlb", func=pull_bounds, metrics_wide=True, @@ -272,6 +271,7 @@ def test_report_traffic_light_bounds(): section_name="Profiles", dynamic_bounds="dtlb", static_bounds="static_tlb", + settings=settings.report, ) pipeline = Pipeline(modules=[m1, m2, ctlb, m3, m4, rg]) diff --git a/tests/popmon/alerting/test_integration.py b/tests/popmon/alerting/test_integration.py index a5446c84..5d9c9d51 100644 --- a/tests/popmon/alerting/test_integration.py +++ b/tests/popmon/alerting/test_integration.py @@ -88,7 +88,6 @@ def test_traffic_light_summary(): output = datastore["alerts"]["the_feature"] - assert output["worst"].values[-1] == 2 assert output["n_green"].values[-1] == 1 assert output["n_yellow"].values[-1] == 0 assert output["n_red"].values[-1] == 1 @@ -137,7 +136,6 @@ def test_traffic_light_summary_combination(): assert "_AGGREGATE_" in alerts output = datastore["alerts"]["_AGGREGATE_"] - assert output["worst"].values[-1] == 2 assert output["n_green"].values[-1] == 1 assert output["n_yellow"].values[-1] == 0 assert output["n_red"].values[-1] == 1 diff --git a/tests/popmon/analysis/comparison/test_comparisons.py b/tests/popmon/analysis/comparison/test_comparisons.py new file mode 100644 index 00000000..154da9b7 --- /dev/null +++ b/tests/popmon/analysis/comparison/test_comparisons.py @@ -0,0 +1,100 @@ +import numpy as np +import pytest + +from popmon.analysis.comparison.comparisons import ( + jensen_shannon_divergence, + ks_prob, + ks_test, + kullback_leibler_divergence, + population_stability_index, + uu_chi2, +) + + +@pytest.mark.filterwarnings("ignore:invalid value encountered in true_divide") +def test_uu_chi2(): + arr1 = np.array([1, 2, 0, 5]) + arr2 = np.array([3, 4, 0, 2]) + + chi2_value, chi2_norm, z_score, p_value, res = uu_chi2(arr1, arr2) + + np.testing.assert_equal(len(res) + 1, arr1.shape[0]) + 
np.testing.assert_almost_equal(chi2_value, 2.9036, 4) + np.testing.assert_almost_equal(chi2_norm, 1.45180, 4) + np.testing.assert_almost_equal(z_score, 0.7252, 4) + np.testing.assert_almost_equal(p_value, 0.2341, 4) + + +def test_ks_test(): + np.testing.assert_almost_equal(ks_test([2, 4], [1, 3]), 0.1291, 4) + np.testing.assert_equal(ks_test([1, 1], [5, 7]), ks_test([5, 7], [1, 1])) + + +def test_ks_prob(): + np.testing.assert_equal(ks_prob(0.1), 1) + np.testing.assert_equal(ks_prob(10.0), np.nan) + np.testing.assert_equal(ks_prob(0.4), ks_prob(-0.4)) + np.testing.assert_almost_equal(ks_prob(0.8), 0.5441, 4) + np.testing.assert_almost_equal(ks_prob(3.0), 0.0, 4) + + +def test_kl(): + np.testing.assert_almost_equal( + kullback_leibler_divergence( + np.array([0.25, 0.25, 0.25, 0.25]), np.array([0.85, 0.05, 0.05, 0.05]) + ), + 0.90105, + 4, + ) + np.testing.assert_almost_equal( + kullback_leibler_divergence( + np.array([0.85, 0.05, 0.05, 0.05]), np.array([0.25, 0.25, 0.25, 0.25]) + ), + 0.79875, + 4, + ) + np.testing.assert_equal( + kullback_leibler_divergence( + np.array([0.25, 0.25, 0.25, 0.25]), np.array([0.25, 0.25, 0.25, 0.25]) + ), + 0, + ) + np.testing.assert_equal( + kullback_leibler_divergence( + np.array([0.85, 0.05, 0.05, 0.05]), np.array([0.85, 0.05, 0.05, 0.05]) + ), + 0, + ) + np.testing.assert_almost_equal( + kullback_leibler_divergence( + np.array([0.0, 0.0, 0.0, 0.0]), np.array([0.0, 0.0, 0.0, 0.05]) + ), + 0, + 4, + ) + + +def test_psi(): + p = np.array([0.85, 0.05, 0.05, 0.05]) + q = np.array([0.25, 0.25, 0.25, 0.25]) + + np.testing.assert_almost_equal( + population_stability_index(p, q), 1.699815077214137, 4 + ) + np.testing.assert_almost_equal( + population_stability_index(p, q), population_stability_index(q, p), 4 + ) + np.testing.assert_almost_equal(population_stability_index(q, q), 0.0, 4) + + +def test_jsd(): + p = np.array([0.85, 0.05, 0.05, 0.05]) + q = np.array([0.25, 0.25, 0.25, 0.25]) + + # JSD is symmetric + 
np.testing.assert_almost_equal( + jensen_shannon_divergence(p, q), jensen_shannon_divergence(q, p), 4 + ) + + # JSD = 0 iff P=Q + np.testing.assert_almost_equal(jensen_shannon_divergence(q, q), 0, 4) diff --git a/tests/popmon/analysis/comparison/test_hist_comparer.py b/tests/popmon/analysis/comparison/test_hist_comparer.py index 466b5c56..0afb31a3 100644 --- a/tests/popmon/analysis/comparison/test_hist_comparer.py +++ b/tests/popmon/analysis/comparison/test_hist_comparer.py @@ -68,7 +68,6 @@ def test_hist_compare(): def test_reference_hist_comparer(): - hist_list = ["date:country", "date:bankrupt", "date:num_employees", "date:A_score"] features = ["country", "bankrupt", "num_employees", "A_score"] @@ -116,7 +115,7 @@ def test_reference_hist_comparer(): df = datastore["comparison"]["A_score"] assert len(df) == 16 - np.testing.assert_array_equal(sorted(df.columns), sorted(cols)) + assert set(df.columns) == set(cols) np.testing.assert_almost_equal(df["ref_chi2"].mean(), 2.623206018518519) df = datastore["comparison"]["country"] @@ -136,7 +135,6 @@ def test_reference_hist_comparer(): def test_expanding_hist_comparer(): - hist_list = ["date:country", "date:bankrupt", "date:num_employees", "date:A_score"] features = ["country", "bankrupt", "num_employees", "A_score"] diff --git a/tests/popmon/analysis/profiling/test_profiles.py b/tests/popmon/analysis/profiling/test_profiles.py new file mode 100644 index 00000000..1f91242a --- /dev/null +++ b/tests/popmon/analysis/profiling/test_profiles.py @@ -0,0 +1,30 @@ +import numpy as np + +from popmon.analysis.profiling.profiles import ( + profile_fraction_of_true as fraction_of_true, +) + + +def test_fraction_of_true(): + res = fraction_of_true([], []) + assert np.isnan(res) + res = fraction_of_true(["a"], [10]) + assert np.isnan(res) + res = fraction_of_true(["a", "b", "c"], [10, 10, 10]) + assert np.isnan(res) + + res = fraction_of_true(np.array(["True", "False"]), np.array([0, 0])) + assert np.isnan(res) + res = 
fraction_of_true(np.array(["True", "False"]), np.array([10, 10])) + assert res == 0.5 + res = fraction_of_true(np.array([True, False]), [10, 10]) + assert res == 0.5 + + res = fraction_of_true(np.array(["True"]), np.array([10])) + assert res == 1.0 + res = fraction_of_true(np.array([True]), np.array([10])) + assert res == 1.0 + res = fraction_of_true(np.array(["False"]), np.array([10])) + assert res == 0.0 + res = fraction_of_true(np.array([False]), np.array([10])) + assert res == 0.0 diff --git a/tests/popmon/base/test_module.py b/tests/popmon/base/test_module.py index c5322b73..abd91728 100644 --- a/tests/popmon/base/test_module.py +++ b/tests/popmon/base/test_module.py @@ -1,32 +1,41 @@ import numpy as np +import pytest from popmon.base import Module -def test_popmon_module(): - class Scaler(Module): - _input_keys = ("input_key",) - _output_keys = ("output_key",) +class Scaler(Module): + _input_keys = ("input_key",) + _output_keys = ("output_key",) - def __init__(self, input_key, output_key, mean, std): - super().__init__() - self.input_key = input_key - self.output_key = output_key - self.mean = mean - self.std = std + def __init__(self, input_key, output_key, mean, std): + super().__init__() + self.input_key = input_key + self.output_key = output_key + self.mean = mean + self.std = std - def transform(self, input_array: np.ndarray): - res = input_array - np.mean(input_array) - res = res / np.std(res) - res = res * self.std - res = res + self.mean - return res + def transform(self, input_array: np.ndarray): + res = input_array - np.mean(input_array) + res = res / np.std(res) + res = res * self.std + res = res + self.mean + return res - test_module = Scaler(input_key="x", output_key="scaled_x", mean=2.0, std=0.3) +@pytest.fixture +def test_module(): + return Scaler(input_key="x", output_key="scaled_x", mean=2.0, std=0.3) + + +def test_popmon_module(test_module): datastore = {"x": np.arange(10)} datastore = test_module.transform(datastore) assert "x" in datastore 
# check if key 'x' is still in the datastore np.testing.assert_almost_equal(np.mean(datastore["scaled_x"]), 2.0, decimal=5) np.testing.assert_almost_equal(np.std(datastore["scaled_x"]), 0.3, decimal=5) + + +def test_popmon_module_repr(test_module): + assert str(test_module) == "Scaler(input_key='x', output_key='scaled_x')" diff --git a/tests/popmon/base/test_pipeline.py b/tests/popmon/base/test_pipeline.py index 79c22908..b3e06c01 100644 --- a/tests/popmon/base/test_pipeline.py +++ b/tests/popmon/base/test_pipeline.py @@ -1,6 +1,7 @@ import logging import numpy as np +import pytest from popmon.base import Module, Pipeline @@ -65,16 +66,12 @@ def transform(self, input_array: np.ndarray, weights: np.ndarray): return result -def test_popmon_pipeline(): +@pytest.fixture +def test_pipeline(): logger = logging.getLogger() logger.addHandler(logging.StreamHandler()) logger.setLevel(logging.INFO) - datastore = {"x": np.array([7, 2, 7, 9, 6]), "weights": np.array([1, 1, 2, 1, 2])} - expected_result = np.sum( - np.power(np.log(datastore["x"]), 2) * datastore["weights"] - ) / np.sum(datastore["weights"]) - log_pow_pipeline = Pipeline( modules=[ LogTransformer(input_key="x", output_key="log_x"), @@ -92,5 +89,22 @@ def test_popmon_pipeline(): ], logger=logger, ) + return pipeline + - assert pipeline.transform(datastore)["res"] == expected_result +def test_popmon_pipeline(test_pipeline): + datastore = {"x": np.array([7, 2, 7, 9, 6]), "weights": np.array([1, 1, 2, 1, 2])} + expected_result = np.sum( + np.power(np.log(datastore["x"]), 2) * datastore["weights"] + ) / np.sum(datastore["weights"]) + + np.testing.assert_array_almost_equal( + test_pipeline.transform(datastore)["res"], expected_result, decimal=12 + ) + + +def test_pipeline_repr(test_pipeline): + assert ( + str(test_pipeline) + == """Pipeline: [\n\tPipeline: [\n\t\tLogTransformer(input_key='x', output_key='log_x')\n\t\tPowerTransformer(input_key='log_x', output_key='log_pow_x')\n\t]\n\tSumNormalizer(input_key='weights', 
output_key='norm_weights')\n\tWeightedSum(input_key='log_pow_x', weight_key='norm_weights', output_key='res')\n]""" + ) diff --git a/tests/popmon/base/test_registry.py b/tests/popmon/base/test_registry.py new file mode 100644 index 00000000..33a2c209 --- /dev/null +++ b/tests/popmon/base/test_registry.py @@ -0,0 +1,84 @@ +import pytest + +from popmon.base.registry import Registry + + +def test_registery(): + MyRegistry = Registry() + + @MyRegistry.register(key="example", description="hello world") + def example_function(my_input): + return 4 + my_input + + @MyRegistry.register( + key=["coefficient", "p_value"], + description=["phi_custom coefficient", "p-value for the phi_custom coeff."], + ) + def phi_custom(): + return 0.25, 0.0001 + + f = MyRegistry.get_func_by_name("example_function") + + # check that original function is intact + assert f(1) == 5 + assert example_function(1) == 5 + + # name should be the same as the original function + assert f.__name__ == "example_function" + + assert MyRegistry.get_keys() == ["example", "coefficient", "p_value"] + assert MyRegistry.get_descriptions() == { + "example": "hello world", + "coefficient": "phi_custom coefficient", + "p_value": "p-value for the phi_custom coeff.", + } + + +def test_registry_properties(): + PropsRegistry = Registry() + + @PropsRegistry.register(key="hello", description="world", dim=3, htype="all") + def my_func(): + return 0 + + assert PropsRegistry.get_keys_by_dim_and_htype(dim=1, htype=None) == [] + assert PropsRegistry.get_keys_by_dim_and_htype(dim=3, htype="all") == ["hello"] + + +def test_registry_duplicate(): + DuplicatedRegistry = Registry() + + @DuplicatedRegistry.register(key="test", description="me") + def func1(): + pass + + with pytest.raises(ValueError) as e: + + @DuplicatedRegistry.register(key="another", description="value") # noqa: F811 + def func1(): # noqa: F811 + pass + + assert ( + e.value.args[0] + == "A function with the name 'func1' has already been registered." 
+ ) + + with pytest.raises(ValueError) as e: + + @DuplicatedRegistry.register(key="test", description="duplicate") + def func2(): + pass + + assert e.value.args[0] == "Key 'test' has already been registered." + + +def test_registry_run(): + RunRegistry = Registry() + + @RunRegistry.register(key="key", description="run me", dim=1, htype="num") + def func(arg1, arg2): + return abs(arg1 - arg2) + + args = [1, 4] + result = RunRegistry.run(args=args, dim=1, htype="num") + assert result == {"key": 3} diff --git a/tests/popmon/pipeline/test_metrics.py b/tests/popmon/pipeline/test_metrics.py index df236a45..0fcc5dfa 100644 --- a/tests/popmon/pipeline/test_metrics.py +++ b/tests/popmon/pipeline/test_metrics.py @@ -3,11 +3,15 @@ from popmon import resources from popmon.base import Pipeline +from popmon.config import Settings from popmon.io import JsonReader from popmon.pipeline.metrics import df_stability_metrics, stability_metrics def test_hists_stability_metrics(): + settings = Settings() + settings.comparison.window = 5 + # get histograms pipeline = Pipeline( modules=[ @@ -28,7 +32,7 @@ def test_hists_stability_metrics(): "date:A_score:num_employees", ] ds = stability_metrics( - hists, reference_type="rolling", window=5, features=hist_list + hists, settings=settings, reference_type="rolling", features=hist_list ) cols = ["profiles", "comparisons", "traffic_lights", "alerts"] @@ -37,6 +41,8 @@ def test_hists_stability_metrics(): def test_df_stability_metrics(): + settings = Settings() + # generate metrics directly from dataframe features = ["date:isActive", "date:eyeColor", "date:latitude"] bin_specs = { @@ -47,7 +53,8 @@ def test_df_stability_metrics(): "latitude": {"bin_width": 5.0, "bin_offset": 0.0}, } ds = df_stability_metrics( - pytest.test_df, + df=pytest.test_df, + settings=settings, time_axis="date", features=features, binning="unit", diff --git a/tests/popmon/pipeline/test_report.py b/tests/popmon/pipeline/test_report.py index a2efe8d8..790e1d8e 100644 --- 
a/tests/popmon/pipeline/test_report.py +++ b/tests/popmon/pipeline/test_report.py @@ -3,12 +3,16 @@ from popmon import resources from popmon.base import Pipeline +from popmon.config import Report, Settings from popmon.hist.filling import get_bin_specs from popmon.io import JsonReader from popmon.pipeline.report import df_stability_report, stability_report def test_hists_stability_report(): + settings = Settings() + settings.comparison.window = 5 + # get histograms pipeline = Pipeline( modules=[ @@ -28,7 +32,9 @@ def test_hists_stability_report(): "date:A_score", "date:A_score:num_employees", ] - stability_report(hists, reference_type="rolling", window=5, features=hist_list) + stability_report( + hists, reference_type="rolling", settings=settings, features=hist_list + ) def test_df_stability_report(): @@ -49,9 +55,17 @@ def test_df_stability_report(): bin_specs=bin_specs, ) + settings = Report() + settings.last_n = 4 + # regenerate report, changing the plot window settings - rep.regenerate(last_n=4) - rep.regenerate(skip_first_n=1, skip_last_n=1) + rep.regenerate(report_settings=settings) + + settings.last_n = 0 + settings.skip_first_n = 1 + settings.skip_last_n = 1 + + rep.regenerate(report_settings=settings) def test_df_stability_report_self(): diff --git a/tests/popmon/pipeline/test_report_pipelines.py b/tests/popmon/pipeline/test_report_pipelines.py index 983a6212..22a7d478 100644 --- a/tests/popmon/pipeline/test_report_pipelines.py +++ b/tests/popmon/pipeline/test_report_pipelines.py @@ -1,5 +1,6 @@ from popmon import resources from popmon.base import Pipeline +from popmon.config import Settings from popmon.io import JsonReader from popmon.pipeline.report_pipelines import ( ExpandingReference, @@ -17,7 +18,7 @@ def test_self_reference(): JsonReader( file_path=resources.data("example_histogram.json"), store_key="hists" ), - SelfReference(hists_key="hists", features=hist_list), + SelfReference(hists_key="hists", features=hist_list, settings=Settings()), ] ) 
pipeline.transform(datastore={}) @@ -35,6 +36,7 @@ def test_external_reference(): hists_key="hists", ref_hists_key="hists", features=hist_list, + settings=Settings(), ), ] ) @@ -44,6 +46,9 @@ def test_external_reference(): def test_rolling_reference(): hist_list = ["date:country", "date:A_score:num_employees"] + settings = Settings() + settings.comparison.window = 5 + pipeline = Pipeline( modules=[ JsonReader( @@ -51,7 +56,7 @@ def test_rolling_reference(): ), RollingReference( hists_key="hists", - window=5, + settings=settings, features=hist_list, ), ] @@ -67,7 +72,9 @@ def test_expanding_reference(): JsonReader( file_path=resources.data("example_histogram.json"), store_key="hists" ), - ExpandingReference(hists_key="hists", features=hist_list), + ExpandingReference( + hists_key="hists", features=hist_list, settings=Settings() + ), ] ) pipeline.transform(datastore={}) diff --git a/tests/popmon/stats/test_numpy.py b/tests/popmon/stats/test_numpy.py index e71d8939..0e8a9945 100644 --- a/tests/popmon/stats/test_numpy.py +++ b/tests/popmon/stats/test_numpy.py @@ -1,21 +1,13 @@ import itertools import numpy as np -import pytest from scipy import linalg, stats from popmon.stats.numpy import ( - fraction_of_true, - jensen_shannon_divergence, - ks_prob, - ks_test, - kullback_leibler_divergence, mean, - population_stability_index, probability_distribution_mean_covariance, quantile, std, - uu_chi2, ) @@ -26,31 +18,6 @@ def get_data(): return a, w -def test_fraction_of_true(): - res = fraction_of_true([], []) - assert np.isnan(res) - res = fraction_of_true(["a"], [10]) - assert np.isnan(res) - res = fraction_of_true(["a", "b", "c"], [10, 10, 10]) - assert np.isnan(res) - - res = fraction_of_true(np.array(["True", "False"]), np.array([0, 0])) - assert np.isnan(res) - res = fraction_of_true(np.array(["True", "False"]), np.array([10, 10])) - assert res == 0.5 - res = fraction_of_true(np.array([True, False]), [10, 10]) - assert res == 0.5 - - res = 
fraction_of_true(np.array(["True"]), np.array([10])) - assert res == 1.0 - res = fraction_of_true(np.array([True]), np.array([10])) - assert res == 1.0 - res = fraction_of_true(np.array(["False"]), np.array([10])) - assert res == 0.0 - res = fraction_of_true(np.array([False]), np.array([10])) - assert res == 0.0 - - def test_mean_shapes(): a, w = get_data() out = mean(a) @@ -225,95 +192,6 @@ def get_quantiles(q): ) -@pytest.mark.filterwarnings("ignore:invalid value encountered in true_divide") -def test_uu_chi2(): - arr1 = np.array([1, 2, 0, 5]) - arr2 = np.array([3, 4, 0, 2]) - - chi2_value, chi2_norm, z_score, p_value, res = uu_chi2(arr1, arr2) - - np.testing.assert_equal(len(res) + 1, arr1.shape[0]) - np.testing.assert_almost_equal(chi2_value, 2.9036, 4) - np.testing.assert_almost_equal(chi2_norm, 1.45180, 4) - np.testing.assert_almost_equal(z_score, 0.7252, 4) - np.testing.assert_almost_equal(p_value, 0.2341, 4) - - -def test_ks_test(): - np.testing.assert_almost_equal(ks_test([2, 4], [1, 3]), 0.1291, 4) - np.testing.assert_equal(ks_test([1, 1], [5, 7]), ks_test([5, 7], [1, 1])) - - -def test_ks_prob(): - np.testing.assert_equal(ks_prob(0.1), 1) - np.testing.assert_equal(ks_prob(10.0), np.nan) - np.testing.assert_equal(ks_prob(0.4), ks_prob(-0.4)) - np.testing.assert_almost_equal(ks_prob(0.8), 0.5441, 4) - np.testing.assert_almost_equal(ks_prob(3.0), 0.0, 4) - - -def test_kl(): - np.testing.assert_almost_equal( - kullback_leibler_divergence( - np.array([0.25, 0.25, 0.25, 0.25]), np.array([0.85, 0.05, 0.05, 0.05]) - ), - 0.90105, - 4, - ) - np.testing.assert_almost_equal( - kullback_leibler_divergence( - np.array([0.85, 0.05, 0.05, 0.05]), np.array([0.25, 0.25, 0.25, 0.25]) - ), - 0.79875, - 4, - ) - np.testing.assert_equal( - kullback_leibler_divergence( - np.array([0.25, 0.25, 0.25, 0.25]), np.array([0.25, 0.25, 0.25, 0.25]) - ), - 0, - ) - np.testing.assert_equal( - kullback_leibler_divergence( - np.array([0.85, 0.05, 0.05, 0.05]), np.array([0.85, 0.05, 0.05, 
0.05]) - ), - 0, - ) - np.testing.assert_almost_equal( - kullback_leibler_divergence( - np.array([0.0, 0.0, 0.0, 0.0]), np.array([0.0, 0.0, 0.0, 0.05]) - ), - 0, - 4, - ) - - -def test_psi(): - p = np.array([0.85, 0.05, 0.05, 0.05]) - q = np.array([0.25, 0.25, 0.25, 0.25]) - - np.testing.assert_almost_equal( - population_stability_index(p, q), 1.699815077214137, 4 - ) - np.testing.assert_almost_equal( - population_stability_index(p, q), population_stability_index(q, p), 4 - ) - np.testing.assert_almost_equal(population_stability_index(q, q), 0.0, 4) - - -def test_jsd(): - p = np.array([0.85, 0.05, 0.05, 0.05]) - q = np.array([0.25, 0.25, 0.25, 0.25]) - - # JSD is symmetric - np.testing.assert_almost_equal( - jensen_shannon_divergence(p, q), jensen_shannon_divergence(q, p), 4 - ) - - # JSD = 0 iff P=Q - np.testing.assert_almost_equal(jensen_shannon_divergence(q, q), 0, 4) - - def test_probability_distribution_mean_covariance(): np.random.seed(42) n_bins = 10 diff --git a/tests/popmon/test_utils.py b/tests/popmon/test_utils.py index 460ff1ea..8e032d19 100644 --- a/tests/popmon/test_utils.py +++ b/tests/popmon/test_utils.py @@ -1,8 +1,10 @@ -from popmon.config import config +from popmon.config import Report from popmon.utils import filter_metrics def test_filter_metrics(): + settings = Report() + metrics = [ "distinct_pull", "filled_pull", @@ -19,7 +21,6 @@ def test_filter_metrics(): "fraction_true_trend10_zscore", "ref_unknown_labels", "prev1_ks_zscore", - "worst", "ref_max_prob_diff", ] expected = [ @@ -36,12 +37,9 @@ def test_filter_metrics(): "fraction_true_trend10_zscore", "ref_unknown_labels", "prev1_ks_zscore", - "worst", "ref_max_prob_diff", ] assert ( - filter_metrics( - metrics, ignore_stat_endswith=[], show_stats=config["limited_stats"] - ) + filter_metrics(metrics, ignore_stat_endswith=[], show_stats=settings.show_stats) == expected ) diff --git a/tests/popmon/visualization/test_report_generator.py b/tests/popmon/visualization/test_report_generator.py 
index 75ffa93e..81e3ce66 100644 --- a/tests/popmon/visualization/test_report_generator.py +++ b/tests/popmon/visualization/test_report_generator.py @@ -4,6 +4,7 @@ from popmon import resources from popmon.analysis.comparison.hist_comparer import ReferenceHistComparer from popmon.base import Pipeline +from popmon.config import Settings from popmon.hist.hist_splitter import HistSplitter from popmon.io import JsonReader from popmon.visualization import ReportGenerator, SectionGenerator @@ -17,6 +18,8 @@ def test_report_generator(): hist_list = ["date:country", "date:bankrupt", "date:num_employees", "date:A_score"] features = ["country", "bankrupt", "num_employees", "A_score"] + settings = Settings() + pipeline = Pipeline( modules=[ JsonReader( @@ -35,7 +38,7 @@ def test_report_generator(): read_key="comparison", store_key="all_sections", section_name="Comparisons", - last_n=2, + settings=settings.report, ), ReportGenerator(read_key="all_sections", store_key="final_report"), ] @@ -49,7 +52,8 @@ def test_report_generator(): for f in features: assert isinstance(datastore["comparison"][f], pd.DataFrame) - assert pipeline.modules[-2].last_n == 2 + assert isinstance(pipeline.modules[-2], SectionGenerator) + assert pipeline.modules[-2].last_n == 0 assert "final_report" in datastore assert ( isinstance(datastore["final_report"], str)