fix: fix default number of histogram bins being extremely high (#167)

The excessive number of bins caused a severe loss of performance for some data. For example, for the column `mass (g)` of the dataset returned by `edvart.example_datasets.dataset_meteorite_landings()`, the number of inferred bins was over 5M, even though the dataset contains fewer than 50k rows.
datamole-ai · Oct 10, 2023 · b0a1e2e · b0a1e2e
1 parent 6072ea2
commit b0a1e2e
Show file tree

Hide file tree

Showing 4 changed files with 18,929 additions and 24,662 deletions.
diff --git a/edvart/report_sections/group_analysis.py b/edvart/report_sections/group_analysis.py
@@ -605,24 +605,31 @@ def overlaid_histograms(
         Name of column to analyze.
     bins : int, optional
         Number of bins in the histogram. If None, number of bin will be inferred using
-        Modified Freedman-Diaconis bin number inference.
+        Freedman-Diaconis bin number inference.
     density : bool (default = True)
         If True, histograms will be normalized to display density.
     alpha : float
         Opacity of individual histograms.
     """
-    # Modified Freedman-Diaconis bin number inference if bins is None
+    data_min = df[column].min()
+    data_max = df[column].max()
+    data_range = data_max - data_min
     if bins is None:
-        iqr = df[column].quantile(0.75) - df[column].quantile(0.25)
-        bin_width = 1 / np.cbrt(len(df)) * iqr
+        # Freedman-Diaconis bin number inference if bins is None
+        iqr = utils.iqr(df[column])
+        bin_width = 2 * iqr / (len(df[column]) ** (1 / 3))
+        bins = int(np.ceil(data_range / bin_width))
+        if bins > 1000:
+            # Use Sturges' rule if number of bins is too large
+            bins = int(np.ceil(np.log2(bins) + 1))
+            bin_width = data_range / bins
     else:
-        bin_width = (df[column].max() - df[column].min()) / bins
-    bin_config = {
-        "start": df[column].min(),
-        "end": df[column].max(),
-        "size": bin_width,
-    }
-
+        bin_width = data_range / bins
+    bin_config = go.histogram.XBins(
+        start=data_min,
+        end=data_max,
+        size=bin_width,
+    )
     # Choose color palette
     colors = cl.scales["9"]["qual"]["Set1"]
     color_idx = 0

diff --git a/edvart/report_sections/univariate_analysis.py b/edvart/report_sections/univariate_analysis.py
@@ -2,6 +2,7 @@
 
 import matplotlib.pyplot as plt
 import nbformat.v4 as nbfv4
+import numpy as np
 import pandas as pd
 import seaborn as sns
 from IPython.display import HTML, Markdown, display
@@ -273,6 +274,16 @@ def histogram(
         boxplot_kwargs = {}
 
     series = series.dropna()
+    if bins is None:
+        bin_edges = np.histogram_bin_edges(series, bins="auto")
+        # Prevent too many bins, which slows down the visualization
+        # "auto" uses the maximum of the Sturges and Freedman-Diaconis estimators
+        # The Freedman-Diaconis rule can infer a huge number of bins for long-tailed distributions
+        # There should never be a good reason to use more than 1000 bins
+        if len(bin_edges) > 1000:
+            bins = "sturges"
+        else:
+            bins = bin_edges
 
     if box_plot:
         _fig, (ax_box, ax_hist) = plt.subplots(
@@ -284,7 +295,7 @@ def histogram(
         sns.boxplot(x=series, ax=ax_box, **boxplot_kwargs)
         sns.histplot(
             data=series,
-            bins=bins or "auto",
+            bins=bins,
             stat="density" if density else "count",
             ax=ax_hist,
             kde=False,
@@ -295,7 +306,7 @@ def histogram(
         plt.figure(figsize=figsize)
         sns.histplot(
             data=series,
-            bins=bins or "auto",
+            bins=bins,
             stat="density" if density else "count",
             kde=False,
             **distplot_kwargs,