feat: implement imbalanced warning

ydataai · Dec 21, 2022 · ce84c81 · ce84c81
1 parent 46a08b5
commit ce84c81
Show file tree

Hide file tree

Showing 15 changed files with 106 additions and 4 deletions.
diff --git a/docsrc/source/pages/getting_started/overview.rst b/docsrc/source/pages/getting_started/overview.rst
@@ -47,7 +47,7 @@ For each column, the following information (whenever relevant for the column typ
 The report contains three additional sections: 
 
 * **Overview**: mostly global details about the dataset (number of records, number of variables, overall missigness and duplicates, memory footprint)
-* **Alerts**: a comprehensive and automatic list of potential data quality issues (high correlation, skewness, uniformity, zeros, missing values, constant values, between others) 
+* **Alerts**: a comprehensive and automatic list of potential data quality issues (high correlation, imbalance, skewness, uniformity, zeros, missing values, constant values, among others) 
 * **Reproduction**: technical details about the analysis (time, version and configuration)
 
 The package can be used via code but also directly as a CLI utility. The generated interactive report can be consumed and shared as regular HTML or embedded in an interactive way inside Jupyter Notebooks. 

diff --git a/docsrc/source/pages/tables/config_variables.csv b/docsrc/source/pages/tables/config_variables.csv
@@ -9,6 +9,8 @@ Parameter,Type,Default,Description
 ``vars.cat.characters``,boolean,``False``,"Check the distribution of characters and their Unicode properties. Often informative, but may be computationally expensive."
 ``vars.cat.words``,boolean,``False``,"Check the distribution of words. Often informative, but may be computationally expensive."
 ``vars.cat.cardinality_threshold``,integer,50,"Warn if the number of distinct values is above this threshold."
+``vars.cat.imbalance_threshold``,float,0.5,"Warn if the imbalance score is above this threshold."
 ``vars.cat.n_obs``,integer,5,"Display this number of observations."
 ``vars.cat.chi_squared_threshold``,float,0.999,"Same as above, but for categorical variables."
 ``vars.bool.n_obs``,integer,3,"Same as above, but for boolean variables."
+``vars.bool.imbalance_threshold``,float,0.5,"Warn if the imbalance score is above this threshold."
diff --git a/docsrc/source/pages/tables/data_quality_alerts.csv b/docsrc/source/pages/tables/data_quality_alerts.csv
@@ -3,6 +3,7 @@
 "``Zeros``","Column only contains zeros"
 "``High Correlation``","Correlations (either Spearman, Cramer, Pearson, Kendall, 𝜙k) are above the warning threshold (configurable)."
 "``High Cardinality``","Whether the column has more than 50 distinct values. Threshold is configurable."
+"``Imbalance``","Column is highly imbalanced. Threshold is configurable."
 "``Skewness``","Column's univariate distribution presents skewness. Threshold value is configurable."
 "``Missing Values``","Column has missing values"
 "``Infinite Values``","Column has infinite values (either ``np.inf`` or ``-np.inf``)"

diff --git a/src/pandas_profiling/config.py b/src/pandas_profiling/config.py
@@ -48,6 +48,7 @@ class CatVars(BaseModel):
     characters: bool = True
     words: bool = True
     cardinality_threshold: int = 50
+    imbalance_threshold: float = 0.5
     n_obs: int = 5
     # Set to zero to disable
     chi_squared_threshold: float = 0.999
@@ -59,6 +60,7 @@ class CatVars(BaseModel):
 
 class BoolVars(BaseModel):
     n_obs: int = 3
+    imbalance_threshold: float = 0.5
 
     # string to boolean mapping dict
     mappings: Dict[str, bool] = {

diff --git a/src/pandas_profiling/model/alerts.py b/src/pandas_profiling/model/alerts.py
@@ -35,6 +35,9 @@ class AlertType(Enum):
     SKEWED = auto()
     """This variable is highly skewed."""
 
+    IMBALANCE = auto()
+    """This variable is imbalanced."""
+
     MISSING = auto()
     """This variable contains missing values."""
 
@@ -224,6 +227,33 @@ def categorical_alerts(config: Settings, summary: dict) -> List[Alert]:
             )
         )
 
+    # Imbalance
+    if (
+        "imbalance" in summary
+        and summary["imbalance"] > config.vars.cat.imbalance_threshold
+    ):
+        alerts.append(
+            Alert(
+                alert_type=AlertType.IMBALANCE,
+                fields={"imbalance"},
+            )
+        )
+    return alerts
+
+
+def boolean_alerts(config: Settings, summary: dict) -> List[Alert]:
+    """Return a list with one IMBALANCE alert when summary["imbalance"] exceeds config.vars.bool.imbalance_threshold, else an empty list."""
+    alerts = []
+    if (
+        "imbalance" in summary
+        and summary["imbalance"] > config.vars.bool.imbalance_threshold
+    ):
+        alerts.append(
+            Alert(
+                alert_type=AlertType.IMBALANCE,
+                fields={"imbalance"},
+            )
+        )
     return alerts
 
 
@@ -302,6 +332,8 @@ def check_variable_alerts(config: Settings, col: str, description: dict) -> List
             alerts += numeric_alerts(config, description)
         if description["type"] == "TimeSeries":
             alerts += timeseries_alerts(config, description)
+        if description["type"] == "Boolean":
+            alerts += boolean_alerts(config, description)
 
     for idx in range(len(alerts)):
         alerts[idx].column_name = col

diff --git a/src/pandas_profiling/model/pandas/describe_boolean_pandas.py b/src/pandas_profiling/model/pandas/describe_boolean_pandas.py
@@ -3,6 +3,7 @@
 import pandas as pd
 
 from pandas_profiling.config import Settings
+from pandas_profiling.model.pandas.imbalance_pandas import column_imbalance_score
 from pandas_profiling.model.summary_algorithms import (
     describe_boolean_1d,
     series_hashable,
@@ -28,4 +29,6 @@ def pandas_describe_boolean_1d(
     value_counts = summary["value_counts_without_nan"]
     summary.update({"top": value_counts.index[0], "freq": value_counts.iloc[0]})
 
+    summary["imbalance"] = column_imbalance_score(value_counts, len(value_counts))
+
     return config, series, summary
diff --git a/src/pandas_profiling/model/pandas/describe_categorical_pandas.py b/src/pandas_profiling/model/pandas/describe_categorical_pandas.py
@@ -7,6 +7,7 @@
 import pandas as pd
 
 from pandas_profiling.config import Settings
+from pandas_profiling.model.pandas.imbalance_pandas import column_imbalance_score
 from pandas_profiling.model.pandas.utils_pandas import weighted_median
 from pandas_profiling.model.summary_algorithms import (
     chi_square,
@@ -229,6 +230,8 @@ def pandas_describe_categorical_1d(
     value_counts = summary["value_counts_without_nan"]
     value_counts.index = value_counts.index.astype(str)
 
+    summary["imbalance"] = column_imbalance_score(value_counts, len(value_counts))
+
     redact = config.vars.cat.redact
     if not redact:
         summary.update({"first_rows": series.head(5)})

diff --git a/src/pandas_profiling/model/pandas/imbalance_pandas.py b/src/pandas_profiling/model/pandas/imbalance_pandas.py
@@ -0,0 +1,35 @@
+from typing import Union
+
+import pandas as pd
+from numpy import log2
+from scipy.stats import entropy
+
+
+def column_imbalance_score(
+    value_counts: pd.Series, n_classes: int
+) -> Union[float, int]:
+    """Return an entropy-based imbalance score for a categorical or boolean column.
+
+    The score is bounded between 0 and 1: a perfectly uniform distribution scores 0,
+    and a perfectly imbalanced distribution scores 1.
+
+    For a distribution over a finite set of values (e.g. a categorical variable), entropy is largest when the distribution is flattest (Jaynes, "Probability Theory: The Logic of Science").
+    The imbalance is therefore measured by comparing the entropy of the observed distribution with the maximum possible entropy for the same number of classes.
+    The observed entropy is computed from the value counts (i.e. class frequencies) in base 2; the maximum entropy is log2(n_classes).
+    The score is 1 - (observed entropy / maximum entropy).
+
+    Args:
+        value_counts (pd.Series): frequency of each category
+        n_classes (int): number of classes
+
+    Returns:
+        Union[float, int]: the score, between 0 and 1 inclusive; the integer 0 when n_classes <= 1
+    """
+    # With at most one class there is nothing to be imbalanced against, so the score is 0.
+    # This guard also avoids dividing by log2(1) == 0.
+    if n_classes > 1:
+        # casting to a float numpy array to ensure a correct dtype when a categorical integer
+        # variable is evaluated
+        value_counts = value_counts.to_numpy(dtype=float)
+        return 1 - (entropy(value_counts, base=2) / log2(n_classes))
+    return 0
diff --git a/src/pandas_profiling/model/summarizer.py b/src/pandas_profiling/model/summarizer.py
@@ -7,6 +7,7 @@
 from pandas_profiling.config import Settings
 from pandas_profiling.model.handler import Handler
 from pandas_profiling.model.summary_algorithms import (
+    describe_boolean_1d,
     describe_categorical_1d,
     describe_counts,
     describe_date_1d,
@@ -58,7 +59,9 @@ def __init__(self, typeset: VisionsTypeset, *args, **kwargs):
             "Categorical": [
                 describe_categorical_1d,
             ],
-            "Boolean": [],
+            "Boolean": [
+                describe_boolean_1d,
+            ],
             "URL": [
                 describe_url_1d,
             ],

diff --git a/src/pandas_profiling/report/presentation/flavours/html/alerts.py b/src/pandas_profiling/report/presentation/flavours/html/alerts.py
@@ -10,6 +10,7 @@ def render(self) -> str:
             "type_date": "warning",
             "constant_length": "primary",
             "high_cardinality": "primary",
+            "imbalance": "primary",
             "unique": "primary",
             "uniform": "primary",
             "infinite": "info",

diff --git a/src/pandas_profiling/report/presentation/flavours/html/templates/alerts/alert_imbalance.html b/src/pandas_profiling/report/presentation/flavours/html/templates/alerts/alert_imbalance.html
@@ -0,0 +1 @@
+<a href="#pp_var_{{ alert.anchor_id }}"><code>{{ alert.column_name }}</code></a> is highly imbalanced ({{ alert.values['imbalance'] | fmt_percent}})
diff --git a/src/pandas_profiling/report/presentation/flavours/widget/alerts.py b/src/pandas_profiling/report/presentation/flavours/widget/alerts.py
@@ -25,6 +25,7 @@ def render(self) -> widgets.GridBox:
             "truncated": "info",
             "missing": "info",
             "skewed": "info",
+            "imbalance": "info",
             "high_correlation": "",
             "duplicates": "",
             "empty": "",

diff --git a/tests/unit/test_correlations.py b/tests/unit/test_correlations.py
@@ -3,7 +3,7 @@
 import pytest
 
 from pandas_profiling import ProfileReport
-from pandas_profiling.report.presentation.core import HTML, CorrelationTable, Image
+from pandas_profiling.report.presentation.core import CorrelationTable, Image
 from pandas_profiling.report.structure.correlations import get_correlation_items
 
 

diff --git a/tests/unit/test_example.py b/tests/unit/test_example.py
@@ -49,4 +49,4 @@ def test_example(get_data_file, test_output_dir):
         type(profile.get_description()) == dict
         and len(profile.get_description().items()) == 10
     ), "Unexpected result"
-    assert "<span class=badge>12</span>" in profile.to_html()
+    assert "<span class=badge>14</span>" in profile.to_html()
diff --git a/tests/unit/test_pandas/test_imbalance.py b/tests/unit/test_pandas/test_imbalance.py
@@ -0,0 +1,18 @@
+import pandas as pd
+
+from pandas_profiling.model.pandas.imbalance_pandas import column_imbalance_score
+
+
+def test_column_imbalance_score_many_classes():
+    value_counts = pd.Series([10, 20, 60, 10])  # proportions 0.1/0.2/0.6/0.1: entropy ~1.571 bits vs. log2(4) = 2
+    assert column_imbalance_score(value_counts, len(value_counts)).round(2) == 0.21
+
+
+def test_column_imbalance_score_uniform_distribution():
+    value_counts = pd.Series([10, 10, 10, 10, 10])  # equal counts: entropy equals the maximum, log2(5)
+    assert column_imbalance_score(value_counts, len(value_counts)).round(2) == 0
+
+
+def test_column_imbalance_score_one_class():
+    value_counts = [30]  # a plain list works here: with n_classes == 1 the function returns 0 without touching value_counts
+    assert column_imbalance_score(value_counts, len(value_counts)) == 0
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		<a href="#pp_var_{{ alert.anchor_id }}"><code>{{ alert.column_name }}</code></a> is highly imbalanced ({{ alert.values['imbalance'] | fmt_percent}})