Skip to content

Commit

Permalink
fix: fix default number of histogram bins being extremely high (#167)
Browse files Browse the repository at this point in the history
The high number of bins caused an extreme loss of performance for some
specific data.
For example, for the column `mass (g)` of the dataset returned by
`edvart.example_datasets.dataset_meteorite_landings()`, the number of
inferred bins is over 5M, even though the dataset contains under 50k
rows.
  • Loading branch information
mbelak-dtml committed Oct 10, 2023
1 parent 6072ea2 commit b0a1e2e
Show file tree
Hide file tree
Showing 4 changed files with 18,929 additions and 24,662 deletions.
29 changes: 18 additions & 11 deletions edvart/report_sections/group_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -605,24 +605,31 @@ def overlaid_histograms(
Name of column to analyze.
bins : int, optional
Number of bins in the histogram. If None, the number of bins will be inferred using
Modified Freedman-Diaconis bin number inference.
Freedman-Diaconis bin number inference.
density : bool (default = True)
If True, histograms will be normalized to display density.
alpha : float
Opacity of individual histograms.
"""
# Modified Freedman-Diaconis bin number inference if bins is None
data_min = df[column].min()
data_max = df[column].max()
data_range = data_max - data_min
if bins is None:
iqr = df[column].quantile(0.75) - df[column].quantile(0.25)
bin_width = 1 / np.cbrt(len(df)) * iqr
# Freedman-Diaconis bin number inference if bins is None
iqr = utils.iqr(df[column])
bin_width = 2 * iqr / (len(df[column]) ** (1 / 3))
bins = int(np.ceil(data_range / bin_width))
if bins > 1000:
# Fall back to a Sturges-style formula (log2 of the inferred bin count, plus 1) if the number of bins is too large
bins = int(np.ceil(np.log2(bins) + 1))
bin_width = data_range / bins
else:
bin_width = (df[column].max() - df[column].min()) / bins
bin_config = {
"start": df[column].min(),
"end": df[column].max(),
"size": bin_width,
}

bin_width = data_range / bins
bin_config = go.histogram.XBins(
start=data_min,
end=data_max,
size=bin_width,
)
# Choose color palette
colors = cl.scales["9"]["qual"]["Set1"]
color_idx = 0
Expand Down
15 changes: 13 additions & 2 deletions edvart/report_sections/univariate_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import matplotlib.pyplot as plt
import nbformat.v4 as nbfv4
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import HTML, Markdown, display
Expand Down Expand Up @@ -273,6 +274,16 @@ def histogram(
boxplot_kwargs = {}

series = series.dropna()
if bins is None:
bin_edges = np.histogram_bin_edges(series, bins="auto")
# Prevent too many bins, which slows down the visualization
# "auto" uses the maximum of the Sturges and Freedman-Diaconis estimators
# The Freedman-Diaconis rule can infer a huge number of bins for long-tailed distributions
# There should never be a good reason to use more than 1000 bins
if len(bin_edges) > 1000:
bins = "sturges"
else:
bins = bin_edges

if box_plot:
_fig, (ax_box, ax_hist) = plt.subplots(
Expand All @@ -284,7 +295,7 @@ def histogram(
sns.boxplot(x=series, ax=ax_box, **boxplot_kwargs)
sns.histplot(
data=series,
bins=bins or "auto",
bins=bins,
stat="density" if density else "count",
ax=ax_hist,
kde=False,
Expand All @@ -295,7 +306,7 @@ def histogram(
plt.figure(figsize=figsize)
sns.histplot(
data=series,
bins=bins or "auto",
bins=bins,
stat="density" if density else "count",
kde=False,
**distplot_kwargs,
Expand Down
Loading

0 comments on commit b0a1e2e

Please sign in to comment.