Skip to content

Commit

Permalink
feat: implement imbalanced warning
Browse files Browse the repository at this point in the history
  • Loading branch information
jtook authored and aquemy committed Dec 21, 2022
1 parent 46a08b5 commit ce84c81
Show file tree
Hide file tree
Showing 15 changed files with 106 additions and 4 deletions.
2 changes: 1 addition & 1 deletion docsrc/source/pages/getting_started/overview.rst
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ For each column, the following information (whenever relevant for the column typ
The report contains three additional sections:

* **Overview**: mostly global details about the dataset (number of records, number of variables, overall missingness and duplicates, memory footprint)
* **Alerts**: a comprehensive and automatic list of potential data quality issues (high correlation, skewness, uniformity, zeros, missing values, constant values, between others)
* **Alerts**: a comprehensive and automatic list of potential data quality issues (high correlation, imbalance, skewness, uniformity, zeros, missing values, constant values, among others)
* **Reproduction**: technical details about the analysis (time, version and configuration)

The package can be used via code but also directly as a CLI utility. The generated interactive report can be consumed and shared as regular HTML or embedded in an interactive way inside Jupyter Notebooks.
Expand Down
2 changes: 2 additions & 0 deletions docsrc/source/pages/tables/config_variables.csv
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ Parameter,Type,Default,Description
``vars.cat.characters``,boolean,``False``,"Check the distribution of characters and their Unicode properties. Often informative, but may be computationally expensive."
``vars.cat.words``,boolean,``False``,"Check the distribution of words. Often informative, but may be computationally expensive."
``vars.cat.cardinality_threshold``,integer,50,"Warn if the number of distinct values is above this threshold."
``vars.cat.imbalance_threshold``,float,0.5,"Warn if the imbalance score is above this threshold."
``vars.cat.n_obs``,integer,5,"Display this number of observations."
``vars.cat.chi_squared_threshold``,float,0.999,"Same as above, but for categorical variables."
``vars.bool.n_obs``,integer,3,"Same as above, but for boolean variables."
``vars.bool.imbalance_threshold``,float,0.5,"Warn if the imbalance score is above this threshold."
1 change: 1 addition & 0 deletions docsrc/source/pages/tables/data_quality_alerts.csv
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
"``Zeros``","Column only contains zeros"
"``High Correlation``","Correlations (either Spearman, Cramer, Pearson, Kendall, 𝜙k) are above the warning threshold (configurable)."
"``High Cardinality``","Whether the column has more than 50 distinct values. Threshold is configurable."
"``Imbalance``","Column is highly imbalanced. Threshold is configurable."
"``Skewness``","Column's univariate distribution presents skewness. Threshold value is configurable."
"``Missing Values``","Column has missing values"
"``Infinite Values``","Column has infinite values (either ``np.inf`` or ``-np.inf``)"
Expand Down
2 changes: 2 additions & 0 deletions src/pandas_profiling/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ class CatVars(BaseModel):
characters: bool = True
words: bool = True
cardinality_threshold: int = 50
imbalance_threshold: float = 0.5
n_obs: int = 5
# Set to zero to disable
chi_squared_threshold: float = 0.999
Expand All @@ -59,6 +60,7 @@ class CatVars(BaseModel):

class BoolVars(BaseModel):
n_obs: int = 3
imbalance_threshold: float = 0.5

# string to boolean mapping dict
mappings: Dict[str, bool] = {
Expand Down
32 changes: 32 additions & 0 deletions src/pandas_profiling/model/alerts.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,9 @@ class AlertType(Enum):
SKEWED = auto()
"""This variable is highly skewed."""

IMBALANCE = auto()
"""This variable is imbalanced."""

MISSING = auto()
"""This variable contains missing values."""

Expand Down Expand Up @@ -224,6 +227,33 @@ def categorical_alerts(config: Settings, summary: dict) -> List[Alert]:
)
)

# Imbalance
if (
"imbalance" in summary
and summary["imbalance"] > config.vars.cat.imbalance_threshold
):
alerts.append(
Alert(
alert_type=AlertType.IMBALANCE,
fields={"imbalance"},
)
)
return alerts


def boolean_alerts(config: Settings, summary: dict) -> List[Alert]:
    """Build the alerts that apply to a boolean variable.

    Args:
        config: report settings; ``config.vars.bool.imbalance_threshold`` is
            the score above which the variable is flagged as imbalanced.
        summary: description of the variable; only the optional
            ``"imbalance"`` entry is read here.

    Returns:
        A list holding one ``IMBALANCE`` alert when the imbalance score is
        strictly above the threshold, otherwise an empty list.
    """
    # The threshold is only looked up when a score is actually present.
    is_imbalanced = (
        "imbalance" in summary
        and summary["imbalance"] > config.vars.bool.imbalance_threshold
    )
    if not is_imbalanced:
        return []

    return [
        Alert(
            alert_type=AlertType.IMBALANCE,
            fields={"imbalance"},
        )
    ]


Expand Down Expand Up @@ -302,6 +332,8 @@ def check_variable_alerts(config: Settings, col: str, description: dict) -> List
alerts += numeric_alerts(config, description)
if description["type"] == "TimeSeries":
alerts += timeseries_alerts(config, description)
if description["type"] == "Boolean":
alerts += boolean_alerts(config, description)

for idx in range(len(alerts)):
alerts[idx].column_name = col
Expand Down
3 changes: 3 additions & 0 deletions src/pandas_profiling/model/pandas/describe_boolean_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import pandas as pd

from pandas_profiling.config import Settings
from pandas_profiling.model.pandas.imbalance_pandas import column_imbalance_score
from pandas_profiling.model.summary_algorithms import (
describe_boolean_1d,
series_hashable,
Expand All @@ -28,4 +29,6 @@ def pandas_describe_boolean_1d(
value_counts = summary["value_counts_without_nan"]
summary.update({"top": value_counts.index[0], "freq": value_counts.iloc[0]})

summary["imbalance"] = column_imbalance_score(value_counts, len(value_counts))

return config, series, summary
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import pandas as pd

from pandas_profiling.config import Settings
from pandas_profiling.model.pandas.imbalance_pandas import column_imbalance_score
from pandas_profiling.model.pandas.utils_pandas import weighted_median
from pandas_profiling.model.summary_algorithms import (
chi_square,
Expand Down Expand Up @@ -229,6 +230,8 @@ def pandas_describe_categorical_1d(
value_counts = summary["value_counts_without_nan"]
value_counts.index = value_counts.index.astype(str)

summary["imbalance"] = column_imbalance_score(value_counts, len(value_counts))

redact = config.vars.cat.redact
if not redact:
summary.update({"first_rows": series.head(5)})
Expand Down
35 changes: 35 additions & 0 deletions src/pandas_profiling/model/pandas/imbalance_pandas.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
from typing import Union

import pandas as pd
from numpy import log2
from scipy.stats import entropy


def column_imbalance_score(
    value_counts: pd.Series, n_classes: int
) -> Union[float, int]:
    """Return an entropy-based class imbalance score bounded between 0 and 1.

    The score is used for categorical and boolean variables. A perfectly
    uniform distribution scores 0, and the score grows towards 1 as the
    observations concentrate on fewer classes.

    For a distribution over a finite number of classes, entropy is maximised
    when the distribution is flat (Jaynes: Probability Theory, The Logic of
    Science), and that maximum is ``log2(n_classes)``. The entropy of the
    value counts is divided by this maximum to obtain a value between 0 and 1,
    which is then subtracted from 1.

    Args:
        value_counts (pd.Series): frequency of each category
        n_classes (int): number of classes

    Returns:
        Union[float, int]: score bounded between 0 and 1 inclusively; the
            integer 0 is returned when there are fewer than two classes
    """
    # With a single class the entropy is 0 and the column is treated as
    # balanced. Returning early also avoids dividing by log2(1) == 0.
    if n_classes <= 1:
        return 0

    # Cast to a float array to ensure a correct dtype when the counts of a
    # categorical integer variable are evaluated.
    frequencies = value_counts.to_numpy(dtype=float)
    score = 1 - (entropy(frequencies, base=2) / log2(n_classes))

    # Floating-point rounding can push entropy / log2(n_classes) marginally
    # above 1 for a uniform distribution, which would give a tiny negative
    # score. Clamp so the documented [0, 1] bound always holds; ``score`` is a
    # numpy scalar, so ``clip`` keeps the numpy return type callers rely on.
    return score.clip(0.0, 1.0)
5 changes: 4 additions & 1 deletion src/pandas_profiling/model/summarizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from pandas_profiling.config import Settings
from pandas_profiling.model.handler import Handler
from pandas_profiling.model.summary_algorithms import (
describe_boolean_1d,
describe_categorical_1d,
describe_counts,
describe_date_1d,
Expand Down Expand Up @@ -58,7 +59,9 @@ def __init__(self, typeset: VisionsTypeset, *args, **kwargs):
"Categorical": [
describe_categorical_1d,
],
"Boolean": [],
"Boolean": [
describe_boolean_1d,
],
"URL": [
describe_url_1d,
],
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ def render(self) -> str:
"type_date": "warning",
"constant_length": "primary",
"high_cardinality": "primary",
"imbalance": "primary",
"unique": "primary",
"uniform": "primary",
"infinite": "info",
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
<a href="#pp_var_{{ alert.anchor_id }}"><code>{{ alert.column_name }}</code></a> is highly imbalanced ({{ alert.values['imbalance'] | fmt_percent}})
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ def render(self) -> widgets.GridBox:
"truncated": "info",
"missing": "info",
"skewed": "info",
"imbalance": "info",
"high_correlation": "",
"duplicates": "",
"empty": "",
Expand Down
2 changes: 1 addition & 1 deletion tests/unit/test_correlations.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import pytest

from pandas_profiling import ProfileReport
from pandas_profiling.report.presentation.core import HTML, CorrelationTable, Image
from pandas_profiling.report.presentation.core import CorrelationTable, Image
from pandas_profiling.report.structure.correlations import get_correlation_items


Expand Down
2 changes: 1 addition & 1 deletion tests/unit/test_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,4 +49,4 @@ def test_example(get_data_file, test_output_dir):
type(profile.get_description()) == dict
and len(profile.get_description().items()) == 10
), "Unexpected result"
assert "<span class=badge>12</span>" in profile.to_html()
assert "<span class=badge>14</span>" in profile.to_html()
18 changes: 18 additions & 0 deletions tests/unit/test_pandas/test_imbalance.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
import pandas as pd

from pandas_profiling.model.pandas.imbalance_pandas import column_imbalance_score


def test_column_imbalance_score_many_classes():
    """A skewed four-class distribution yields a score of 0.21 (2 d.p.)."""
    counts = pd.Series([10, 20, 60, 10])
    score = column_imbalance_score(counts, len(counts))
    assert score.round(2) == 0.21


def test_column_imbalance_score_uniform_distribution():
    """Five equally frequent classes are reported as balanced (score 0)."""
    counts = pd.Series([10, 10, 10, 10, 10])
    score = column_imbalance_score(counts, len(counts))
    assert score.round(2) == 0


def test_column_imbalance_score_one_class():
    """A single class is reported as balanced (score 0)."""
    # Pass a Series, the input type the function is annotated and documented
    # to accept, rather than a bare list that only works because the
    # single-class case returns before the counts are touched.
    value_counts = pd.Series([30])
    assert column_imbalance_score(value_counts, len(value_counts)) == 0

0 comments on commit ce84c81

Please sign in to comment.