feat: simplify configuration

ing-bank · Jun 15, 2022 · 3a23f39 · 3a23f39
1 parent 7d63afc
commit 3a23f39
Show file tree

Hide file tree

Showing 4 changed files with 65 additions and 272 deletions.
diff --git a/popmon/pipeline/metrics.py b/popmon/pipeline/metrics.py
@@ -27,6 +27,7 @@
     make_histograms,
 )
 
+from ..config import Settings
 from ..pipeline.metrics_pipelines import create_metrics_pipeline
 
 logging.basicConfig(
@@ -37,13 +38,10 @@
 
 def stability_metrics(
     hists,
+    settings: Settings,
     reference_type="self",
     reference=None,
     time_axis="",
-    window=10,
-    shift=1,
-    monitoring_rules=None,
-    pull_rules=None,
     features=None,
     **kwargs,
 ):
@@ -54,47 +52,6 @@ def stability_metrics(
         default is 'self'.
     :param reference: histograms used as reference. default is None
     :param str time_axis: name of datetime feature, used as time axis, eg 'date'. auto-guessed when not provided.
-    :param int window: size of rolling window and/or trend detection. default is 10.
-    :param int shift: shift of time-bins in rolling/expanding window. default is 1.
-    :param dict monitoring_rules: monitoring rules to generate traffic light alerts.
-        The default setting is:
-
-        .. code-block:: python
-
-            monitoring_rules = {
-                "*_pull": [7, 4, -4, -7],
-                "*_zscore": [7, 4, -4, -7],
-                "[!p]*_unknown_labels": [0.5, 0.5, 0, 0],
-            }
-
-        Note that the (filename based) wildcards such as * apply to all statistic names matching that pattern.
-        For example, ``"*_pull"`` applies for all features to all statistics ending on "_pull".
-        You can also specify rules for specific features and/or statistics by leaving out wildcard and putting the
-        feature name in front. E.g.
-
-        .. code-block:: python
-
-            monitoring_rules = {
-                "featureA:*_pull": [5, 3, -3, -5],
-                "featureA:nan": [4, 1, 0, 0],
-                "*_pull": [7, 4, -4, -7],
-                "nan": [8, 1, 0, 0],
-            }
-
-        In case of multiple rules could apply for a feature's statistic, the most specific one applies.
-        So in case of the statistic "nan": "featureA:nan" is used for "featureA", and the other "nan" rule
-        for all other features.
-    :param dict pull_rules: red and yellow (possibly dynamic) boundaries shown in plots in the report.
-        Default is:
-
-        .. code-block:: python
-
-            pull_rules = {"*_pull": [7, 4, -4, -7]}
-
-        This means that the shown yellow boundaries are at -4, +4 standard deviations around the (reference) mean,
-        and the shown red boundaries are at -7, +7 standard deviations around the (reference) mean.
-        Note that the (filename based) wildcards such as * apply to all statistic names matching that pattern.
-        (The same string logic applies as for monitoring_rules.)
     :param list features: histograms to pick up from the 'hists' dictionary (default is all keys)
     :param kwargs: residual keyword arguments passed on to create_metrics_pipeline().
     :return: dict with results of metrics pipeline
@@ -103,32 +60,22 @@ def stability_metrics(
     if not isinstance(hists, dict):
         raise TypeError("hists should be a dict of histogrammar histograms.")
 
-    if not isinstance(monitoring_rules, dict):
-        monitoring_rules = {
-            "*_pull": [7, 4, -4, -7],
-            "*_zscore": [7, 4, -4, -7],
-            "[!p]*_unknown_labels": [0.5, 0.5, 0, 0],
-        }
-    if not isinstance(pull_rules, dict):
-        pull_rules = {"*_pull": [7, 4, -4, -7]}
-
     if (isinstance(time_axis, str) and len(time_axis) == 0) or (
         isinstance(time_axis, bool) and time_axis
     ):
         # auto guess the time_axis: find the most frequent first column name in the histograms list
         first_cols = [k.split(":")[0] for k in list(hists.keys())]
         time_axis = max(set(first_cols), key=first_cols.count)
 
+    if reference_type == "external" and "ref_hists_key" not in kwargs:
+        kwargs["ref_hists_key"] = "ref_hists"
+
     pipeline = create_metrics_pipeline(
+        settings=settings,
         reference_type=reference_type,
         reference=reference,
         hists_key="hists",
-        ref_hists_key="ref_hists",
         time_axis=time_axis,
-        window=window,
-        shift=shift,
-        monitoring_rules=monitoring_rules,
-        pull_rules=pull_rules,
         features=features,
         **kwargs,
     )
@@ -143,6 +90,7 @@ def stability_metrics(
 def df_stability_metrics(
     df,
     time_axis,
+    settings: Settings,
     features=None,
     binning="auto",
     bin_specs=None,
@@ -151,10 +99,6 @@ def df_stability_metrics(
     var_dtype=None,
     reference_type="self",
     reference=None,
-    window=10,
-    shift=1,
-    monitoring_rules=None,
-    pull_rules=None,
     **kwargs,
 ):
     """Create a data stability monitoring html datastore for given pandas or spark dataframe.
@@ -204,47 +148,6 @@ def df_stability_metrics(
     :param reference_type: type of reference used for comparisons. Options [self, external, rolling, expanding].
         default is 'self'.
     :param reference: reference dataframe or histograms. default is None
-    :param int window: size of rolling window and/or trend detection. default is 10.
-    :param int shift: shift of time-bins in rolling/expanding window. default is 1.
-    :param dict monitoring_rules: monitoring rules to generate traffic light alerts.
-        The default setting is:
-
-        .. code-block:: python
-
-            monitoring_rules = {
-                "*_pull": [7, 4, -4, -7],
-                "*_zscore": [7, 4, -4, -7],
-                "[!p]*_unknown_labels": [0.5, 0.5, 0, 0],
-            }
-
-        Note that the (filename based) wildcards such as * apply to all statistic names matching that pattern.
-        For example, ``"*_pull"`` applies for all features to all statistics ending on "_pull".
-        You can also specify rules for specific features and/or statistics by leaving out wildcard and putting the
-        feature name in front. E.g.
-
-        .. code-block:: python
-
-            monitoring_rules = {
-                "featureA:*_pull": [5, 3, -3, -5],
-                "featureA:nan": [4, 1, 0, 0],
-                "*_pull": [7, 4, -4, -7],
-                "nan": [8, 1, 0, 0],
-            }
-
-        In case of multiple rules could apply for a feature's statistic, the most specific one applies.
-        So in case of the statistic "nan": "featureA:nan" is used for "featureA", and the other "nan" rule
-        for all other features.
-    :param dict pull_rules: red and yellow (possibly dynamic) boundaries shown in plots in the report.
-        Default is:
-
-        .. code-block:: python
-
-            pull_rules = {"*_pull": [7, 4, -4, -7]}
-
-        This means that the shown yellow boundaries are at -4, +4 standard deviations around the (reference) mean,
-        and the shown red boundaries are at -7, +7 standard deviations around the (reference) mean.
-        Note that the (filename based) wildcards such as * apply to all statistic names matching that pattern.
-        (The same string logic applies as for monitoring_rules.)
     :param kwargs: residual keyword arguments, passed on to stability_metrics()
     :return: dict with results of metrics pipeline
     """
@@ -304,7 +207,6 @@ def df_stability_metrics(
         }
         bin_specs[time_axis] = time_specs
 
-    reference_hists = None
     if reference is not None:
         reference_type = "external"
         if isinstance(reference, dict):
@@ -331,6 +233,7 @@ def df_stability_metrics(
                 var_dtype,
                 ret_specs=True,
             )
+        kwargs["reference_hists"] = reference_hists
 
     # use the same features, bin_specs, time_axis, etc as for reference hists
     hists = make_histograms(
@@ -345,13 +248,9 @@ def df_stability_metrics(
     # generate data stability report
     return stability_metrics(
         hists,
-        reference_type,
-        reference_hists,
-        time_axis,
-        window,
-        shift,
-        monitoring_rules,
-        pull_rules,
-        features,
+        settings=settings,
+        reference_type=reference_type,
+        time_axis=time_axis,
+        features=features,
         **kwargs,
     )
diff --git a/popmon/pipeline/metrics_pipelines.py b/popmon/pipeline/metrics_pipelines.py
@@ -68,12 +68,12 @@ def get_metrics_pipeline_class(reference_type, reference):
 
 
 def create_metrics_pipeline(
+    settings: Settings,
     reference_type="self",
     reference=None,
     hists_key="hists",
     time_axis="",
     features=None,
-    settings: Settings = None,
     **kwargs,
 ):
     # configuration and datastore for report pipeline