ing-bank · pradyot-09 · May 20, 2022 · May 22, 2022 · May 23, 2022 · May 25, 2022
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -18,7 +18,7 @@ repos:
             - tryceratops
         args: [ "--ignore=E501,E203,W503,TC003,TC101,TC300"]
 -   repo: https://github.com/asottile/pyupgrade
-    rev: v2.32.1
+    rev: v2.34.0
     hooks:
     -   id: pyupgrade
         args: ['--py36-plus','--exit-zero-even-if-changed']

diff --git a/README.rst b/README.rst
@@ -10,8 +10,8 @@ Population Shift Monitoring
 `popmon` works with both **pandas** and **spark datasets**.
 
 `popmon` creates histograms of features binned in time-slices,
-and compares the stability of the profiles and distributions of
-those histograms using statistical tests, both over time and with respect to a reference.
+and compares the stability of the `profiles <https://popmon.readthedocs.io/en/latest/profiles.html>`_ and distributions of
+those histograms using `statistical tests <https://popmon.readthedocs.io/en/latest/comparisons.html>`_, both over time and with respect to a reference.
 It works with numerical, ordinal, categorical features, and the histograms can be higher-dimensional, e.g. it can also track correlations between any two features.
 `popmon` can **automatically flag** and alert on **changes observed over time**, such
 as trends, shifts, peaks, outliers, anomalies, changing correlations, etc,
@@ -266,7 +266,7 @@ Copyright ING WBAA. `popmon` is completely free, open-source and licensed under
 .. |logo| image:: https://raw.githubusercontent.com/ing-bank/popmon/master/docs/source/assets/popmon-logo.png
     :alt: POPMON logo
     :target: https://github.com/ing-bank/popmon
-.. |example| image:: https://raw.githubusercontent.com/ing-bank/popmon/master/docs/source/assets/traffic_light_overview.png
+.. |example| image:: https://raw.githubusercontent.com/ing-bank/popmon/master/docs/source/assets/report_overview.png
     :alt: Traffic Light Overview
 .. |pipeline| image:: https://raw.githubusercontent.com/ing-bank/popmon/master/docs/source/assets/pipeline.png
     :alt: Pipeline Visualization

diff --git a/commitlint.config.js b/commitlint.config.js
@@ -0,0 +1,5 @@
+module.exports = {
+  extends: ['@commitlint/config-conventional'],
+  rules: { 'footer-max-line-length': [1, 'always', 100] },
+  parserPreset: { parserOpts: { noteKeywords: ['\\[.+\\]:'] } },
+}
diff --git a/docs/source/assets/report_overview.png b/docs/source/assets/report_overview.png
diff --git a/docs/source/index.rst b/docs/source/index.rst
@@ -14,6 +14,7 @@ Contents
    :maxdepth: 2
 
    introduction
+   profiles
    comparisons
    tutorials
    configuration

diff --git a/docs/source/introduction.rst b/docs/source/introduction.rst
@@ -11,15 +11,15 @@ To monitor the stability over time, we have developed popmon (**pop**\ ulation s
 
 For each column, the stability is determined by taking a reference (for example the data on which you have trained your classifier) and contrasting each time slot to this reference. This can be done in various ways:
 
-* Profiles: for example tracking the mean over time and contrasting this to the reference data. Similar analyses can be done with other summary statistics, such as median, min, max or quartiles.
+* :doc:`Profiles <profiles>`: for example tracking the mean over time and contrasting this to the reference data. Similar analyses can be done with other summary statistics, such as median, min, max or quartiles.
 * :doc:`Comparisons <comparisons>`: statistically comparing each time slot to the reference data (for example using Kolmogorov-Smirnov, chi-squared, or Pearson correlation).
 
 The reference can be defined in four different ways:
 
-1. Using the DataFrame on which you build the stability report as the reference, essentially allowing you to identify outlier time slots within the provided data.
-2. Using a separate reference DataFrame (for example the data on which your classifier was trained, as in the above example), allowing you to identify which time slots deviate from this reference DataFrame.
-3. Using a sliding window, allowing you to compare each time slot to a window of preceding time slots (by default the 10 preceding time slots).
-4. Using an expanding reference, allowing you to compare each time slot to all preceding time slots.
+#. Using the DataFrame on which you build the stability report as the reference, essentially allowing you to identify outlier time slots within the provided data.
+#. Using a separate reference DataFrame (for example the data on which your classifier was trained, as in the above example), allowing you to identify which time slots deviate from this reference DataFrame.
+#. Using a sliding window, allowing you to compare each time slot to a window of preceding time slots (by default the 10 preceding time slots).
+#. Using an expanding reference, allowing you to compare each time slot to all preceding time slots.
 
 We define the normalized residual of a value of interest with respect to the selected reference as:
 
@@ -40,9 +40,9 @@ To determine the difference compared to the reference, we also compute the value
 on the reference data (top panel) and determine the mean and standard deviations across time units
 (center panel). We then determine the traffic lights as follows:
 
-* 🟢 Green traffic light: indicates that there is no meaningful difference compared to the reference, i.e. the value of interest is less than four standard deviations away from the reference.
-* 🟡 Yellow traffic light: indicates that there is a moderate difference compared to the reference, i.e. the value of interest is between four and seven standard deviations away from the reference.
-* 🔴 Red traffic light: indicates that there is a big difference compared to the reference, i.e. the value of interest is more than seven standard deviations away from the reference.
+- 🟢 Green traffic light: indicates that there is no meaningful difference compared to the reference, i.e. the value of interest is less than four standard deviations away from the reference.
+- 🟡 Yellow traffic light: indicates that there is a moderate difference compared to the reference, i.e. the value of interest is between four and seven standard deviations away from the reference.
+- 🔴 Red traffic light: indicates that there is a big difference compared to the reference, i.e. the value of interest is more than seven standard deviations away from the reference.
 
 Of course, the exact thresholds (four and seven standard deviations) can be configured as a parameter. These traffic light bounds are then applied to the value of interest on the data from our initial DataFrame (bottom panel).
 

diff --git a/docs/source/profiles.rst b/docs/source/profiles.rst
@@ -0,0 +1,89 @@
+========
+Profiles
+========
+
+Profiles: tracking a metric over time
+
+Available profiles
+------------------
+The following metrics are implemented:
+
+Any dimension:
+
+- count
+
+1D histogram, all types:
+
+- filled
+- underflow, overflow
+- nan
+
+1D histogram, numeric:
+
+- mean
+- 1%, 5%, 16%, 50% (median), 84%, 95%, 99% percentiles
+- std
+- min, max
+
+1D histogram, categorical:
+
+- fraction of true
+
+2D histogram:
+
+- phik
+
+
+Custom profiles
+---------------
+
+Tracking custom metrics over time is easy.
+The following code snippet registers a new metric to ``popmon``.
+
+.. code-block:: python
+
+    import numpy as np
+
+    from popmon.analysis.profiling.profiles import Profiles
+
+
+    @Profiles.register(key="name_of_profile", description="<description_for_report>", dim=2)
+    def your_profile_function_name(hist) -> float:
+        """Write your function to profile the histogram."""
+        return np.sum(hist)
+
+Variations:
+
+- A profile function may return multiple values for efficiency (e.g. several quantiles do not need to be computed separately)
+
+.. code-block:: python
+
+    @Profiles.register(
+        key=["key1", "key2"], description=["Statistic 1", "Statistic 2"], dim=None
+    )
+    def your_profile_function_name(hist) -> float:
+        result1, result2 = your_logic(hist)
+        return result1, result2
+
+- A profile may work on the histogram, or on the value counts/labels (also for efficiency).
+  This occurs when the ``htype`` parameter is passed (1D only).
+
+.. code-block:: python
+
+    @Profiles.register(
+        key="name_of_profile", description="<description_for_report>", dim=1, htype="all"
+    )
+    def your_profile_function_name(bin_labels, bin_counts) -> float:
+        return bin_counts.sum()
+
+- Profiles may depend on variable type (possible values for ``htype``: ``num``, ``cat``, ``all``).
+
+.. code-block:: python
+
+    @Profiles.register(
+        key="name_of_profile", description="<description_for_report>", dim=1, htype="num"
+    )
+    def your_profile_function_name(bin_labels, bin_counts) -> float:
+        return bin_counts.sum()
+
+If you developed a custom profile that could be generically used, then please consider contributing it to the package.
diff --git a/examples/synthetic_data_streams/README.md b/examples/synthetic_data_streams/README.md
@@ -0,0 +1,40 @@
+# Synthetic Data Streams
+
+This directory contains reference configurations for several publicly available datasets that are widely used for evaluating 
+the performance of algorithms dealing with dataset shift.
+
+## Datasets
+
+The following synthetic datasets are currently available:
+- Sine1, Sine2, Mixed, Stagger, Circles, LED [[Link]](https://github.com/alipsgh/data-streams)  
+- SEA, Hyperplane [[Link]](https://www.win.tue.nl/~mpechen/data/DriftSets/)
+
+| Dataset    | # Instances | # Features | # Classes | Drift type           |
+|------------|-------------|------------|-----------|----------------------|
+| Sine1      | 100.000     | 2          | 2         | Sudden               |
+| Sine2      | 100.000     | 2          | 2         | Sudden               |
+| Mixed      | 100.000     | 4          | 2         | Sudden               |
+| Stagger    | 100.000     | 3          | 2         | Sudden               |
+| Circles    | 100.000     | 2          | 2         | Gradual              |
+| LED        | 100.000     | 24         | 10        | Sudden               |
+| SEA        | 50.000      | 3          | 2         | Sudden               |
+| Hyperplane | 10.000      | 10         | 2         | Gradual; Incremental |
+
+_Characteristics of datasets used, see the survey [Learning under Concept Drift: A Review](https://arxiv.org/pdf/2004.05785.pdf) for more information._
+
+For the sudden-drift datasets, the drifting point is centred at every 5th of the instances for Sine1, Sine2 and Mixed and at each 3rd for Stagger, for a transition over 50 samples. 
+For the remaining gradually shifting datasets, Circles and LED, the drifting point is centred around every 4th, and takes place over 500 instances. 
+A noise level of 10\% is added to each dataset. 
+For the SEA dataset, the drifting points occur at each 4th of the dataset. 
+The Hyperplane dataset that was used consists of 10.000 samples, and its drift is incremental and gradual.
+
+(adding other datasets will be simple based on the available reference configuration)
+
+## Getting started
+
+Follow these steps to produce a `popmon` report for a dataset:
+
+- Download the dataset from the URL above
+- Store the dataset in `data/`, and extract if compressed
+- Run the relevant reference configurations in this folder (e.g. `led.py`)
+- The HTML report will be generated in `reports/`
diff --git a/examples/synthetic_data_streams/circles.py b/examples/synthetic_data_streams/circles.py
@@ -0,0 +1,26 @@
+"""
+Example configuration for the circles dataset
+"""
+from synthetic_data_streams import (
+    dataset_summary,
+    load_arff,
+    synthetic_data_stream_report,
+)
+
+dataset_name = "circles_w_500_n_0.1"
+
+# Stream (101-200)
+v = "101"
+
+# Monitor each feature w.r.t. the label
+features = ["index:x:class", "index:y:class", "index:x:y:class"]
+
+dataset_file = f"data/{dataset_name}/{dataset_name}_{v}.arff"
+report_file = f"reports/{dataset_name}_{v}.html"
+
+df = load_arff(dataset_file)
+
+dataset_summary(df)
+
+# Reduce the time_width for this smaller dataset
+synthetic_data_stream_report(df, features, report_file, time_width=1000)
diff --git a/examples/synthetic_data_streams/data/.gitignore b/examples/synthetic_data_streams/data/.gitignore
diff --git a/examples/synthetic_data_streams/hyperplane.py b/examples/synthetic_data_streams/hyperplane.py
@@ -0,0 +1,38 @@
+"""
+Example configuration for the hyperplane dataset
+"""
+from sklearn.linear_model import LogisticRegression
+from synthetic_data_streams import (
+    dataset_summary,
+    load_arff,
+    synthetic_data_stream_report,
+)
+
+dataset_name = "hyperplane"
+v = "1"
+
+# Monitor each feature w.r.t. the label
+features = [f"index:attr{i}:output" for i in range(10)]
+
+# Also monitor predictions w.r.t. the label (see below)
+features += ["index:prediction:output"]
+
+dataset_file = f"data/{dataset_name}{v}.arff"
+report_file = f"reports/{dataset_name}_{v}.html"
+
+df = load_arff(dataset_file)
+
+# Fit a logistic regression on the first 10% of the data.
+model = LogisticRegression(C=1e5)
+model.fit(df.loc[:1000, df.columns != "output"], df.loc[:1000, "output"])
+
+# Use the model to predict over the full dataset
+df["prediction"] = model.predict_proba(df.loc[:, df.columns != "output"])[:, 1]
+
+dataset_summary(df)
+
+# The training set for the model will be used as reference.
+# The reduced time_width is because this is a smaller dataset compared to the rest
+synthetic_data_stream_report(
+    df, features, report_file, time_width=500, reference="start", split=1000
+)
diff --git a/examples/synthetic_data_streams/led.py b/examples/synthetic_data_streams/led.py
@@ -0,0 +1,41 @@
+"""
+Example configuration for the LED dataset
+"""
+from synthetic_data_streams import (
+    dataset_summary,
+    load_arff,
+    synthetic_data_stream_report,
+)
+
+dataset_name = "led_w_500_n_0.1"
+
+# Stream (101-200)
+v = "101"
+
+# Obtained by running once with:
+# features = []
+
+# Most alerts are found in: a7,a6,a4,a22,a15,a12,a0
+features = [
+    # Monitor each feature w.r.t. the label
+    "index:a0:class",
+    "index:a4:class",
+    "index:a6:class",
+    "index:a7:class",
+    "index:a12:class",
+    "index:a15:class",
+    "index:a22:class",
+    # the relevant interactions correspond to 2^7 (128) * number of classes entries (10) per time slice
+    # "index:a0:a4:a6:a7:a12:a15:a22:class",
+]
+
+
+dataset_file = f"data/{dataset_name}/{dataset_name}_{v}.arff"
+report_file = f"reports/{dataset_name}_{v}.html"
+
+df = load_arff(dataset_file)
+
+dataset_summary(df)
+
+# Reduce the time_width for this smaller dataset
+synthetic_data_stream_report(df, features, report_file, time_width=1000)
diff --git a/examples/synthetic_data_streams/mixed.py b/examples/synthetic_data_streams/mixed.py
@@ -0,0 +1,29 @@
+"""
+Example configuration for the mixed dataset
+"""
+from synthetic_data_streams import (
+    dataset_summary,
+    load_arff,
+    synthetic_data_stream_report,
+)
+
+dataset_name = "mixed_w_50_n_0.1"
+
+# Stream (101-200)
+v = "101"
+
+# Monitor the feature distribution (equivalent to features=[])
+# features = ["index:v", "index:w", "index:x", "index:y", "index:class"]
+
+# Monitor each feature w.r.t. the label
+features = ["index:v:class", "index:w:class", "index:x:class", "index:y:class"]
+
+dataset_file = f"data/{dataset_name}/{dataset_name}_{v}.arff"
+report_file = f"reports/{dataset_name}_{v}.html"
+
+df = load_arff(dataset_file)
+
+dataset_summary(df)
+
+# Reduce the time_width for this smaller dataset
+synthetic_data_stream_report(df, features, report_file, time_width=1000)
diff --git a/examples/synthetic_data_streams/reports/.gitignore b/examples/synthetic_data_streams/reports/.gitignore
diff --git a/examples/synthetic_data_streams/sea.py b/examples/synthetic_data_streams/sea.py
@@ -0,0 +1,30 @@
+"""
+Example configuration for the SEA dataset
+"""
+from synthetic_data_streams import (
+    dataset_summary,
+    load_arff,
+    synthetic_data_stream_report,
+)
+
+dataset_name = "sea"
+
+# Generate report with each feature (equivalent to [f"index:{feature}" for feature in df.columns])
+# features = []
+
+# Monitor interactions of each feature with the class variable
+# features = ["index:at1:cl", "index:at2:cl", "index:at3:cl"]
+
+# From the interactions, we see that only the first two features are relevant. We can monitor their interaction
+#  with the class variable in higher-dimension
+features = ["index:at1:at2:cl"]
+
+dataset_file = f"data/{dataset_name}.arff"
+report_file = f"reports/{dataset_name}.html"
+
+df = load_arff(dataset_file)
+
+dataset_summary(df)
+
+# 50.000 samples, then 2500 time-width results in 20 batches
+synthetic_data_stream_report(df, features, report_file, time_width=2500)