datamole-ai · lukany · Oct 10, 2023 · Oct 5, 2023 · Oct 5, 2023 · Oct 10, 2023
@@ -605,24 +605,31 @@ def overlaid_histograms(
         Name of column to analyze.
     bins : int, optional
         Number of bins in the histogram. If None, the number of bins will be inferred using
-        Modified Freedman-Diaconis bin number inference.
+        Freedman-Diaconis bin number inference.
     density : bool (default = True)
         If True, histograms will be normalized to display density.
     alpha : float
         Opacity of individual histograms.
     """
-    # Modified Freedman-Diaconis bin number inference if bins is None
+    data_min = df[column].min()
+    data_max = df[column].max()
+    data_range = data_max - data_min
     if bins is None:
-        iqr = df[column].quantile(0.75) - df[column].quantile(0.25)
-        bin_width = 1 / np.cbrt(len(df)) * iqr
+        # Freedman-Diaconis bin number inference if bins is None
+        iqr = utils.iqr(df[column])
+        bin_width = 2 * iqr / (len(df[column]) ** (1 / 3))
+        bins = int(np.ceil(data_range / bin_width))
+        if bins > 1000:
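+            # Too many bins: shrink the count logarithmically (Sturges-style formula).
+            # NOTE(review): Sturges' rule proper uses log2 of the sample size, not of the bin count.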
+            bins = int(np.ceil(np.log2(bins) + 1))
+            bin_width = data_range / bins
     else:
-        bin_width = (df[column].max() - df[column].min()) / bins
-    bin_config = {
-        "start": df[column].min(),
-        "end": df[column].max(),
-        "size": bin_width,
-    }
-
+        bin_width = data_range / bins
+    bin_config = go.histogram.XBins(
+        start=data_min,
+        end=data_max,
+        size=bin_width,
+    )
     # Choose color palette
     colors = cl.scales["9"]["qual"]["Set1"]
     color_idx = 0

@@ -2,6 +2,7 @@
 
 import matplotlib.pyplot as plt
 import nbformat.v4 as nbfv4
+import numpy as np
 import pandas as pd
 import seaborn as sns
 from IPython.display import HTML, Markdown, display
@@ -273,6 +274,16 @@ def histogram(
         boxplot_kwargs = {}
 
     series = series.dropna()
+    if bins is None:
+        bin_edges = np.histogram_bin_edges(series, bins="auto")
+        # Prevent too many bins, which slows down the visualization
+        # "auto" uses the maximum of the Sturges and Freedman-Diaconis estimators
+        # The Freedman-Diaconis rule can infer a huge number of bins for long-tailed distributions
+        # There should never be a good reason to use more than 1000 bins
+        if len(bin_edges) > 1000:
+            bins = "sturges"
+        else:
+            bins = bin_edges
 
     if box_plot:
         _fig, (ax_box, ax_hist) = plt.subplots(
@@ -284,7 +295,7 @@ def histogram(
         sns.boxplot(x=series, ax=ax_box, **boxplot_kwargs)
         sns.histplot(
             data=series,
-            bins=bins or "auto",
+            bins=bins,
             stat="density" if density else "count",
             ax=ax_hist,
             kde=False,
@@ -295,7 +306,7 @@ def histogram(
         plt.figure(figsize=figsize)
         sns.histplot(
             data=series,
-            bins=bins or "auto",
+            bins=bins,
             stat="density" if density else "count",
             kde=False,
             **distplot_kwargs,