feat: introduce auto parameter for correlations (#1095)

* feat: introduce discretization capabilities
* feat: introduce 'auto' parameter to correlations
* docs: make documentation after adding 'auto' option to the correlation metrics
* feat: introduce n_bins as a parameter to 'auto' correlation
* feat: introduce option for user to change n_bins argument for 'auto' correlation
ydataai · Oct 20, 2022 · 344b176 · 344b176
1 parent 744a40d
commit 344b176
Show file tree

Hide file tree

Showing 16 changed files with 337 additions and 6 deletions.
diff --git a/docsrc/source/pages/advanced_usage/available_settings.rst b/docsrc/source/pages/advanced_usage/available_settings.rst
@@ -91,6 +91,7 @@ For instance, to disable all correlation computations (may be relevant for large
     profile = df.profile_report(
         title="Report without correlations",
         correlations={
+            "auto": {"calculate": False},
             "pearson": {"calculate": False},
             "spearman": {"calculate": False},
             "kendall": {"calculate": False},

diff --git a/docsrc/source/pages/advanced_usage/corr_mat_access.rst b/docsrc/source/pages/advanced_usage/corr_mat_access.rst
@@ -19,7 +19,7 @@ The snippet below shows how to list the available correlation matrices:
     correlations = profile.description_set["correlations"]
     print(correlations.keys())
 
-In this case, all the 5 possible correlation metrics were computed and the output would be ``dict_keys(['spearman', 'pearson', 'kendall', 'cramers', 'phi_k'])``. Each ``correlations[CORRELATION_TYPE]`` object is a pandas ``DataFrame``. To access its values:
+In this case, all the 6 possible correlation metrics were computed and the output would be ``dict_keys(['auto', 'spearman', 'pearson', 'kendall', 'cramers', 'phi_k'])``. Each ``correlations[CORRELATION_TYPE]`` object is a pandas ``DataFrame``. To access its values:
 
 .. code-block:: python
   :caption: Accessing the values of the Pearson correlation

diff --git a/docsrc/source/pages/getting_started/overview.rst b/docsrc/source/pages/getting_started/overview.rst
@@ -37,7 +37,7 @@ For each column, the following information (whenever relevant for the column typ
 * **Descriptive statistics**: mean, mode, standard deviation, sum, median absolute deviation, coefficient of variation, kurtosis, skewness
 * **Most frequent and extreme values**
 * **Histograms:** categorical and numerical
-* **Correlations**: high correlation warnings, based on different correlation metrics (Spearman, Pearson, Kendall, Cramér's V, Phik)
+* **Correlations**: high correlation warnings, based on different correlation metrics (Spearman, Pearson, Kendall, Cramér's V, Phik, Auto)
 * **Missing values**: through counts, matrix, heatmap and dendrograms
 * **Duplicate rows**: list of the most common duplicated rows
 * **Text analysis**: most common categories (uppercase, lowercase, separator), scripts (Latin, Cyrillic) and blocks (ASCII, Cyrillic)

diff --git a/docsrc/source/pages/reference/api/_autosummary/pandas_profiling.config.Theme.rst b/docsrc/source/pages/reference/api/_autosummary/pandas_profiling.config.Theme.rst
@@ -19,5 +19,7 @@
 
       ~Theme.united
       ~Theme.flatly
+      ~Theme.cosmo
+      ~Theme.simplex
 
 
diff --git a/...source/pages/reference/api/_autosummary/pandas_profiling.model.correlations.rst b/...source/pages/reference/api/_autosummary/pandas_profiling.model.correlations.rst
@@ -26,6 +26,7 @@
 
    .. autosummary::
 
+      Auto
       Correlation
       Cramers
       Kendall

diff --git a/...source/pages/reference/api/_autosummary/pandas_profiling.visualisation.plot.rst b/...source/pages/reference/api/_autosummary/pandas_profiling.visualisation.plot.rst
@@ -18,7 +18,7 @@
       get_cmap_half
       get_correlation_font_size
       histogram
-      mini_histogram
+      plot_acf_pacf
       scatter_complex
       scatter_pairwise
       scatter_series

diff --git a/docsrc/source/pages/tables/config_correlations.csv b/docsrc/source/pages/tables/config_correlations.csv
@@ -13,4 +13,7 @@ Parameter,Type,Default,Description
 ``correlations.phi_k.threshold``,float,0.9,"Warning threshold"
 ``correlations.cramers.calculate``,boolean,``True``,"Whether to calculate this coefficient"
 ``correlations.cramers.warn_high_correlations``,boolean,``True``,"Show warning for correlations higher than the threshold"
-``correlations.cramers.threshold``,float,0.9,"Warning threshold"
+``correlations.cramers.threshold``,float,0.9,"Warning threshold"
+``correlations.auto.calculate``,boolean,``True``,"Whether to calculate this coefficient"
+``correlations.auto.warn_high_correlations``,boolean,``True``,"Show warning for correlations higher than the threshold"
+``correlations.auto.threshold``,float,0.9,"Warning threshold"
diff --git a/src/pandas_profiling/config.py b/src/pandas_profiling/config.py
@@ -208,6 +208,7 @@ class Correlation(BaseModel):
     calculate: bool = Field(default=True)
     warn_high_correlations: int = Field(default=10)
     threshold: float = Field(default=0.5)
+    n_bins: int = Field(default=10)
 
 
 class Correlations(BaseModel):
@@ -288,6 +289,7 @@ class Config:
     }
 
     correlations: Dict[str, Correlation] = {
+        "auto": Correlation(key="auto"),
         "spearman": Correlation(key="spearman"),
         "pearson": Correlation(key="pearson"),
         "kendall": Correlation(key="kendall"),
@@ -378,6 +380,7 @@ class Config:
             "dendrogram": False,
         },
         "correlations": {
+            "auto": {"calculate": False},
             "pearson": {"calculate": False},
             "spearman": {"calculate": False},
             "kendall": {"calculate": False},

diff --git a/src/pandas_profiling/config_default.yaml b/src/pandas_profiling/config_default.yaml
@@ -110,6 +110,10 @@ correlations:
       calculate: true
       warn_high_correlations: true
       threshold: 0.9
+    auto:
+      calculate: true
+      warn_high_correlations: true
+      threshold: 0.9
 
 
 # Bivariate / Pairwise relations

diff --git a/src/pandas_profiling/config_minimal.yaml b/src/pandas_profiling/config_minimal.yaml
@@ -111,6 +111,10 @@ correlations:
       calculate: false
       warn_high_correlations: true
       threshold: 0.9
+    auto:
+      calculate: false
+      warn_high_correlations: true
+      threshold: 0.9
 
 
 # Bivariate / Pairwise relations

diff --git a/src/pandas_profiling/model/correlations.py b/src/pandas_profiling/model/correlations.py
@@ -21,6 +21,15 @@ def compute(config: Settings, df: Sized, summary: dict) -> Optional[Sized]:
         raise NotImplementedError()
 
 
+class Auto(Correlation):
+    """Dispatch point for the "auto" correlation measure."""
+
+    @staticmethod
+    @multimethod
+    def compute(
+        config: Settings, df: Sized, summary: dict, n_bins: int
+    ) -> Optional[Sized]:
+        """Base signature of the "auto" correlation computation.
+
+        This generic version only raises ``NotImplementedError``; concrete
+        implementations are attached through ``Auto.compute.register``.
+        """
+        raise NotImplementedError()
+
+
 class Spearman(Correlation):
     @staticmethod
     @multimethod
@@ -71,7 +80,7 @@ def calculate_correlation(
     config: Settings, df: Sized, correlation_name: str, summary: dict
 ) -> Optional[Sized]:
     """Calculate the correlation coefficients between variables for the correlation types selected in the config
-    (pearson, spearman, kendall, phi_k, cramers).
+    (auto, pearson, spearman, kendall, phi_k, cramers).
 
     Args:
         config: report Settings object
@@ -87,6 +96,7 @@ def calculate_correlation(
         return None
 
     correlation_measures = {
+        "auto": Auto,
         "pearson": Pearson,
         "spearman": Spearman,
         "kendall": Kendall,

diff --git a/src/pandas_profiling/model/pandas/correlations_pandas.py b/src/pandas_profiling/model/pandas/correlations_pandas.py
@@ -1,20 +1,25 @@
 """Correlations between variables."""
 import itertools
 import warnings
-from typing import Optional
+from typing import Callable, Optional
 
 import numpy as np
 import pandas as pd
 from scipy import stats
 
 from pandas_profiling.config import Settings
 from pandas_profiling.model.correlations import (
+    Auto,
     Cramers,
     Kendall,
     Pearson,
     PhiK,
     Spearman,
 )
+from pandas_profiling.model.pandas.discretize_pandas import (
+    DiscretizationType,
+    Discretizer,
+)
 
 
 @Spearman.compute.register(Settings, pd.DataFrame, dict)
@@ -67,6 +72,14 @@ def _cramers_corrected_stat(confusion_matrix: pd.DataFrame, correction: bool) ->
     return corr
 
 
+def _pairwise_spearman(col_1: pd.Series, col_2: pd.Series) -> float:
+    """Return Spearman's rank correlation between the two columns."""
+    rho = col_1.corr(col_2, method="spearman")
+    return rho
+
+
+def _pairwise_cramers(col_1: pd.Series, col_2: pd.Series) -> float:
+    """Return the corrected Cramér's V statistic for the two columns."""
+    contingency_table = pd.crosstab(col_1, col_2)
+    return _cramers_corrected_stat(contingency_table, correction=True)
+
+
 @Cramers.compute.register(Settings, pd.DataFrame, dict)
 def pandas_cramers_compute(
     config: Settings, df: pd.DataFrame, summary: dict
@@ -141,3 +154,56 @@ def pandas_phik_compute(
         correlation = phik_matrix(df[selected_cols], interval_cols=list(intcols))
 
     return correlation
+
+
+@Auto.compute.register(Settings, pd.DataFrame, dict, int)
+def pandas_auto_compute(
+    config: Settings,
+    df: pd.DataFrame,
+    summary: dict,
+    n_bins: int = 10,
+) -> Optional[pd.DataFrame]:
+    """Compute the "auto" correlation matrix for a pandas dataframe.
+
+    The measure is chosen per pair of columns:
+
+    * numerical-numerical: Spearman's rank correlation;
+    * any pair involving a categorical/boolean column: corrected Cramér's V,
+      where a numerical column is first discretized into ``n_bins`` bins.
+
+    Args:
+        config: report Settings object; ``categorical_maximum_correlation_distinct``
+            caps the number of distinct values a categorical column may have
+            to be included.
+        df: the dataframe to compute the correlations on.
+        summary: per-column summaries, each providing ``"type"`` and (for
+            categorical/boolean columns) ``"n_distinct"``.
+        n_bins: number of uniform bins used to discretize numerical columns.
+
+    Returns:
+        A symmetric dataframe indexed by the tested columns (numerical columns
+        first, then categorical ones) with ones on the diagonal.
+    """
+    threshold = config.categorical_maximum_correlation_distinct
+
+    numerical_columns = [
+        key for key, value in summary.items() if value["type"] == "Numeric"
+    ]
+    categorical_columns = [
+        key
+        for key, value in summary.items()
+        if value["type"] in {"Categorical", "Boolean"}
+        and value["n_distinct"] <= threshold
+    ]
+    # Sets give constant-time membership tests inside the pairwise loop.
+    numerical = set(numerical_columns)
+    categorical = set(categorical_columns)
+
+    df_discretized = Discretizer(
+        DiscretizationType.UNIFORM, n_bins=n_bins
+    ).discretize_dataframe(df)
+
+    def source_frame(col_name: str, method: Callable) -> pd.DataFrame:
+        # Cramér's V needs categories, so numerical columns are read from the
+        # discretized frame whenever that measure is used.
+        if col_name in numerical and method is _pairwise_cramers:
+            return df_discretized
+        return df
+
+    columns_tested = numerical_columns + categorical_columns
+    correlation_matrix = pd.DataFrame(
+        np.ones((len(columns_tested), len(columns_tested))),
+        index=columns_tested,
+        columns=columns_tested,
+    )
+    for col_1_name, col_2_name in itertools.combinations(columns_tested, 2):
+        # Both names must be checked explicitly: ``a and b not in xs`` only
+        # tests the truthiness of ``a`` and breaks for falsy labels (0, "").
+        method = (
+            _pairwise_spearman
+            if col_1_name not in categorical and col_2_name not in categorical
+            else _pairwise_cramers
+        )
+
+        score = method(
+            source_frame(col_1_name, method)[col_1_name],
+            source_frame(col_2_name, method)[col_2_name],
+        )
+        correlation_matrix.loc[col_1_name, col_2_name] = score
+        correlation_matrix.loc[col_2_name, col_1_name] = score
+
+    return correlation_matrix
diff --git a/src/pandas_profiling/model/pandas/discretize_pandas.py b/src/pandas_profiling/model/pandas/discretize_pandas.py
@@ -0,0 +1,81 @@
+from enum import Enum
+from typing import List
+
+import numpy as np
+import pandas as pd
+
+
+class DiscretizationType(Enum):
+    """Binning strategies supported by ``Discretizer``."""
+
+    # Bins of equal width.
+    UNIFORM = "uniform"
+    # Bins of equal size (same number of observations).
+    QUANTILE = "quantile"
+
+
+class Discretizer:
+    """
+    A class which enables the discretization of a pandas dataframe.
+    Perform this action when you want to convert a continuous variable
+    into a categorical variable.
+
+    Attributes:
+
+    discretization_type (DiscretizationType): controls how the buckets
+    of the discretization are formed. A uniform discretization type forms
+    the bins to be of equal width whereas a quantile discretization type
+    forms the bins to be of equal size.
+
+    n_bins (int): number of bins
+    reset_index (bool): instruction to reset the index of
+                        the dataframe after the discretization
+    """
+
+    def __init__(
+        self, method: DiscretizationType, n_bins: int = 10, reset_index: bool = False
+    ) -> None:
+        self.discretization_type = method
+        self.n_bins = n_bins
+        self.reset_index = reset_index
+
+    def discretize_dataframe(self, dataframe: pd.DataFrame) -> pd.DataFrame:
+        """Return a copy of the dataframe with its numerical columns binned.
+
+        Non-numerical columns are left untouched and the input dataframe is
+        not modified.
+
+        Args:
+            dataframe (pd.DataFrame): pandas dataframe
+
+        Returns:
+            pd.DataFrame: discretized dataframe, with a fresh index when
+            ``reset_index`` is set
+        """
+
+        discretized_df = dataframe.copy()
+        for column in self._get_numerical_columns(dataframe):
+            discretized_df.loc[:, column] = self._discretize_column(
+                discretized_df[column]
+            )
+
+        return (
+            discretized_df.reset_index(drop=True)
+            if self.reset_index
+            else discretized_df
+        )
+
+    def _discretize_column(self, column: pd.Series) -> np.ndarray:
+        """Bin one column according to the configured discretization type.
+
+        Raises:
+            ValueError: if the configured discretization type is not supported.
+        """
+        if self.discretization_type == DiscretizationType.QUANTILE:
+            return self._descritize_quantile(column)
+
+        if self.discretization_type == DiscretizationType.UNIFORM:
+            return self._descritize_uniform(column)
+
+        # Fail loudly instead of silently writing ``None`` into the column.
+        raise ValueError(
+            f"Unsupported discretization type: {self.discretization_type!r}"
+        )
+
+    def _descritize_quantile(self, column: pd.Series) -> np.ndarray:
+        """Bin the column with ``pd.qcut``, returning bin indices."""
+        return pd.qcut(
+            column, q=self.n_bins, labels=False, retbins=False, duplicates="drop"
+        ).values
+
+    def _descritize_uniform(self, column: pd.Series) -> np.ndarray:
+        """Bin the column with ``pd.cut``, returning bin indices."""
+        return pd.cut(
+            column, bins=self.n_bins, labels=False, retbins=False, duplicates="drop"
+        ).values
+
+    def _get_numerical_columns(self, dataframe: pd.DataFrame) -> List[str]:
+        """List the columns whose dtype is a numpy number."""
+        return dataframe.select_dtypes(include=np.number).columns.tolist()
diff --git a/src/pandas_profiling/report/structure/correlations.py b/src/pandas_profiling/report/structure/correlations.py
@@ -55,12 +55,20 @@ def get_correlation_items(config: Settings, summary: dict) -> Optional[Renderabl
     The empirical estimators used for Cramér's V have been proved to be biased, even for large samples.
     We use a bias-corrected measure that has been proposed by Bergsma in 2013 that can be found <a href='http://stats.lse.ac.uk/bergsma/pdf/cramerV3.pdf'>here</a>."""
 
+    auto_description = """The auto setting is an easily interpretable pairwise column metric of the following mapping:
+                        vartype-vartype         : method, 
+                        categorical-categorical : Cramer's V, 
+                        numerical-categorical   : Cramer's V (using a discretized numerical column), 
+                        numerical-numerical     : Spearman's ρ. 
+                        This configuration uses the best suitable for each pair of columns."""
+
     key_to_data = {
         "pearson": (-1, "Pearson's r", pearson_description),
         "spearman": (-1, "Spearman's ρ", spearman_description),
         "kendall": (-1, "Kendall's τ", kendall_description),
         "phi_k": (0, "Phik (φk)", phi_k_description),
         "cramers": (0, "Cramér's V (φc)", cramers_description),
+        "auto": (0, "Auto", auto_description),
     }
 
     image_format = config.plot.image_format
-Original file line number
+Diff line change
@@ Expand Up / @@ -26,6 +26,7 @@ @@
        .. autosummary::
+          Auto
           Correlation
           Cramers
           Kendall
@@ Expand Down @@