Skip to content

Commit

Permalink
fix: update config files to only calculate 'auto' correlation (#1158)
Browse files Browse the repository at this point in the history
  • Loading branch information
jtook authored and aquemy committed Nov 22, 2022
1 parent 3c14d43 commit 34cf73d
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 25 deletions.
62 changes: 42 additions & 20 deletions examples/features/correlation_auto_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,23 +15,45 @@
"""

# Download the UCI Bank Marketing Dataset, as seen in examples/bank_marketing_data/banking_data.py
file_name = cache_zipped_file(
"bank-full.csv",
"https://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank.zip",
)

# The CSV is read with ';' as the field separator.
df = pd.read_csv(file_name, sep=";")

# Build the report object from the loaded DataFrame.
profile = ProfileReport(
df, title="Profile Report of the UCI Bank Marketing Dataset", explorative=True
)


# The simplest way to change the number of bins is to set it directly in your script or notebook.
# This changes the granularity of the association measure for Numerical-Categorical column pairs.
profile.config.correlations["auto"].n_bins = 8


# The 'auto' correlation matrix is displayed with the other correlation matrices in the report.
profile.to_file(Path("uci_bank_marketing_report.html"))
if __name__ == "__main__":
    # Fetch the UCI Bank Marketing Dataset, the same data used by
    # examples/bank_marketing_data/banking_data.py.
    file_name = cache_zipped_file(
        "bank-full.csv",
        "https://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank.zip",
    )

    # The CSV is read with ';' as the field separator.
    df = pd.read_csv(file_name, sep=";")

    # Arguments shared by both reports generated below.
    # NOTE(review): the config path is relative, so this presumably expects to
    # be run from the repository root -- confirm.
    report_title = "Profile Report of the UCI Bank Marketing Dataset"
    default_config = "src/pandas_profiling/config_default.yaml"

    # --- Report 1: 'auto' correlation with a custom bin count ---------------
    # The default configuration already computes the 'auto' correlation; here
    # only its number of bins is overridden. That setting controls the
    # granularity of the association measure for Numerical-Categorical
    # column pairs.
    auto_correlations = {
        "auto": {"n_bins": 8},
    }
    profile = ProfileReport(
        df,
        title=report_title,
        config_file=default_config,
        correlations=auto_correlations,
    )
    # Save the profiling report, including the 'auto' correlation matrix,
    # to an HTML file.
    profile.to_file(Path("auto_uci_bank_marketing_report.html"))

    # --- Report 2: 'auto' disabled, other correlations enabled --------------
    # The default configuration computes only the 'auto' correlation. To turn
    # it off and calculate other correlations instead (here Pearson's and
    # Cramer's V), toggle each one's `calculate` flag explicitly.
    other_correlations = {
        "auto": {"calculate": False},
        "pearson": {"calculate": True},
        "cramers": {"calculate": True},
    }
    no_auto_profile = ProfileReport(
        df,
        title=report_title,
        config_file=default_config,
        correlations=other_correlations,
    )
    # Save the profiling report without the 'auto' correlation matrix to an
    # HTML file.
    no_auto_profile.to_file(Path("no_auto_uci_bank_marketing_report.html"))
10 changes: 5 additions & 5 deletions src/pandas_profiling/config_default.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -90,23 +90,23 @@ missing_diagrams:

correlations:
pearson:
calculate: true
calculate: false
warn_high_correlations: true
threshold: 0.9
spearman:
calculate: true
calculate: false
warn_high_correlations: false
threshold: 0.9
kendall:
calculate: true
calculate: false
warn_high_correlations: false
threshold: 0.9
phi_k:
calculate: true
calculate: false
warn_high_correlations: false
threshold: 0.9
cramers:
calculate: true
calculate: false
warn_high_correlations: true
threshold: 0.9
auto:
Expand Down

0 comments on commit 34cf73d

Please sign in to comment.