Skip to content

Commit

Permalink
feat: simplify configuration
Browse files Browse the repository at this point in the history
  • Loading branch information
sbrugman committed Jun 15, 2022
1 parent 7d63afc commit 3a23f39
Show file tree
Hide file tree
Showing 4 changed files with 65 additions and 272 deletions.
125 changes: 12 additions & 113 deletions popmon/pipeline/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
make_histograms,
)

from ..config import Settings
from ..pipeline.metrics_pipelines import create_metrics_pipeline

logging.basicConfig(
Expand All @@ -37,13 +38,10 @@

def stability_metrics(
hists,
settings: Settings,
reference_type="self",
reference=None,
time_axis="",
window=10,
shift=1,
monitoring_rules=None,
pull_rules=None,
features=None,
**kwargs,
):
Expand All @@ -54,47 +52,6 @@ def stability_metrics(
default is 'self'.
:param reference: histograms used as reference. default is None
:param str time_axis: name of datetime feature, used as time axis, eg 'date'. auto-guessed when not provided.
:param int window: size of rolling window and/or trend detection. default is 10.
:param int shift: shift of time-bins in rolling/expanding window. default is 1.
:param dict monitoring_rules: monitoring rules to generate traffic light alerts.
The default setting is:
.. code-block:: python
monitoring_rules = {
"*_pull": [7, 4, -4, -7],
"*_zscore": [7, 4, -4, -7],
"[!p]*_unknown_labels": [0.5, 0.5, 0, 0],
}
Note that the (filename based) wildcards such as * apply to all statistic names matching that pattern.
For example, ``"*_pull"`` applies for all features to all statistics ending on "_pull".
You can also specify rules for specific features and/or statistics by leaving out wildcard and putting the
feature name in front. E.g.
.. code-block:: python
monitoring_rules = {
"featureA:*_pull": [5, 3, -3, -5],
"featureA:nan": [4, 1, 0, 0],
"*_pull": [7, 4, -4, -7],
"nan": [8, 1, 0, 0],
}
In case of multiple rules could apply for a feature's statistic, the most specific one applies.
So in case of the statistic "nan": "featureA:nan" is used for "featureA", and the other "nan" rule
for all other features.
:param dict pull_rules: red and yellow (possibly dynamic) boundaries shown in plots in the report.
Default is:
.. code-block:: python
pull_rules = {"*_pull": [7, 4, -4, -7]}
This means that the shown yellow boundaries are at -4, +4 standard deviations around the (reference) mean,
and the shown red boundaries are at -7, +7 standard deviations around the (reference) mean.
Note that the (filename based) wildcards such as * apply to all statistic names matching that pattern.
(The same string logic applies as for monitoring_rules.)
:param list features: histograms to pick up from the 'hists' dictionary (default is all keys)
:param kwargs: residual keyword arguments passed on to report pipeline.
:return: dict with results of metrics pipeline
Expand All @@ -103,32 +60,22 @@ def stability_metrics(
if not isinstance(hists, dict):
raise TypeError("hists should be a dict of histogrammar histograms.")

if not isinstance(monitoring_rules, dict):
monitoring_rules = {
"*_pull": [7, 4, -4, -7],
"*_zscore": [7, 4, -4, -7],
"[!p]*_unknown_labels": [0.5, 0.5, 0, 0],
}
if not isinstance(pull_rules, dict):
pull_rules = {"*_pull": [7, 4, -4, -7]}

if (isinstance(time_axis, str) and len(time_axis) == 0) or (
isinstance(time_axis, bool) and time_axis
):
# auto guess the time_axis: find the most frequent first column name in the histograms list
first_cols = [k.split(":")[0] for k in list(hists.keys())]
time_axis = max(set(first_cols), key=first_cols.count)

if reference_type == "external" and "ref_hists_key" not in kwargs:
kwargs["ref_hists_key"] = "ref_hists"

pipeline = create_metrics_pipeline(
settings=settings,
reference_type=reference_type,
reference=reference,
hists_key="hists",
ref_hists_key="ref_hists",
time_axis=time_axis,
window=window,
shift=shift,
monitoring_rules=monitoring_rules,
pull_rules=pull_rules,
features=features,
**kwargs,
)
Expand All @@ -143,6 +90,7 @@ def stability_metrics(
def df_stability_metrics(
df,
time_axis,
settings: Settings,
features=None,
binning="auto",
bin_specs=None,
Expand All @@ -151,10 +99,6 @@ def df_stability_metrics(
var_dtype=None,
reference_type="self",
reference=None,
window=10,
shift=1,
monitoring_rules=None,
pull_rules=None,
**kwargs,
):
"""Create a data stability monitoring html datastore for given pandas or spark dataframe.
Expand Down Expand Up @@ -204,47 +148,6 @@ def df_stability_metrics(
:param reference_type: type or reference used for comparisons. Options [self, external, rolling, expanding].
default is 'self'.
:param reference: reference dataframe or histograms. default is None
:param int window: size of rolling window and/or trend detection. default is 10.
:param int shift: shift of time-bins in rolling/expanding window. default is 1.
:param dict monitoring_rules: monitoring rules to generate traffic light alerts.
The default setting is:
.. code-block:: python
monitoring_rules = {
"*_pull": [7, 4, -4, -7],
"*_zscore": [7, 4, -4, -7],
"[!p]*_unknown_labels": [0.5, 0.5, 0, 0],
}
Note that the (filename based) wildcards such as * apply to all statistic names matching that pattern.
For example, ``"*_pull"`` applies for all features to all statistics ending on "_pull".
You can also specify rules for specific features and/or statistics by leaving out wildcard and putting the
feature name in front. E.g.
.. code-block:: python
monitoring_rules = {
"featureA:*_pull": [5, 3, -3, -5],
"featureA:nan": [4, 1, 0, 0],
"*_pull": [7, 4, -4, -7],
"nan": [8, 1, 0, 0],
}
In case of multiple rules could apply for a feature's statistic, the most specific one applies.
So in case of the statistic "nan": "featureA:nan" is used for "featureA", and the other "nan" rule
for all other features.
:param dict pull_rules: red and yellow (possibly dynamic) boundaries shown in plots in the report.
Default is:
.. code-block:: python
pull_rules = {"*_pull": [7, 4, -4, -7]}
This means that the shown yellow boundaries are at -4, +4 standard deviations around the (reference) mean,
and the shown red boundaries are at -7, +7 standard deviations around the (reference) mean.
Note that the (filename based) wildcards such as * apply to all statistic names matching that pattern.
(The same string logic applies as for monitoring_rules.)
:param kwargs: residual keyword arguments, passed on to stability_report()
:return: dict with results of metrics pipeline
"""
Expand Down Expand Up @@ -304,7 +207,6 @@ def df_stability_metrics(
}
bin_specs[time_axis] = time_specs

reference_hists = None
if reference is not None:
reference_type = "external"
if isinstance(reference, dict):
Expand All @@ -331,6 +233,7 @@ def df_stability_metrics(
var_dtype,
ret_specs=True,
)
kwargs["reference_hists"] = reference_hists

# use the same features, bin_specs, time_axis, etc as for reference hists
hists = make_histograms(
Expand All @@ -345,13 +248,9 @@ def df_stability_metrics(
# generate data stability report
return stability_metrics(
hists,
reference_type,
reference_hists,
time_axis,
window,
shift,
monitoring_rules,
pull_rules,
features,
settings=settings,
reference_type=reference_type,
time_axis=time_axis,
features=features,
**kwargs,
)
2 changes: 1 addition & 1 deletion popmon/pipeline/metrics_pipelines.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,12 +68,12 @@ def get_metrics_pipeline_class(reference_type, reference):


def create_metrics_pipeline(
settings: Settings,
reference_type="self",
reference=None,
hists_key="hists",
time_axis="",
features=None,
settings: Settings = None,
**kwargs,
):
# configuration and datastore for report pipeline
Expand Down
Loading

0 comments on commit 3a23f39

Please sign in to comment.