Skip to content

Commit

Permalink
fix: remove correlation calculation for constants (#1152)
Browse files: browse the repository at this point in the history
* fix: remove correlation calculation for constants

* test: ensure no correlation is calculated when columns are identical constants
  • Loading branch information
jtook authored and aquemy committed Nov 22, 2022
1 parent 70ee5c7 commit 1ed2bc0
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 8 deletions.
13 changes: 9 additions & 4 deletions src/pandas_profiling/model/pandas/correlations_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ def pandas_cramers_compute(
key
for key, value in summary.items()
if value["type"] in {"Categorical", "Boolean"}
- and value["n_distinct"] <= threshold
+ and 1 < value["n_distinct"] <= threshold
}
)

Expand Down Expand Up @@ -161,16 +161,21 @@ def pandas_auto_compute(
config: Settings, df: pd.DataFrame, summary: dict
) -> Optional[pd.DataFrame]:
threshold = config.categorical_maximum_correlation_distinct

numerical_columns = [
- key for key, value in summary.items() if value["type"] == "Numeric"
+ key
+ for key, value in summary.items()
+ if value["type"] == "Numeric" and value["n_distinct"] > 1
]
categorical_columns = [
key
for key, value in summary.items()
if value["type"] in {"Categorical", "Boolean"}
- and value["n_distinct"] <= threshold
+ and 1 < value["n_distinct"] <= threshold
]

+ if len(numerical_columns + categorical_columns) <= 1:
+ return None

df_discretized = Discretizer(
DiscretizationType.UNIFORM, n_bins=config.correlations["auto"].n_bins
).discretize_dataframe(df)
Expand Down
6 changes: 2 additions & 4 deletions tests/issues/test_issue51.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,5 @@ def test_issue51_identical():
df, title="Pandas Profiling Report", progress_bar=False, explorative=True
)
report.config.vars.num.low_categorical_threshold = 0

- assert (
- report.get_description()["correlations"]["cramers"].values == np.ones((3, 3))
- ).all()
+ # this should not return any correlation value as the variables are identical constants
+ assert report.get_description()["correlations"] == {}

0 comments on commit 1ed2bc0

Please sign in to comment.