Skip to content

Commit

Permalink
feat: add density histogram (#1458)
Browse files Browse the repository at this point in the history
* feat: add histogram density option

* test: add unit test

* fix: discard weights if exceed max_bins
  • Loading branch information
alexbarros authored Sep 26, 2023
1 parent fdc0346 commit 1c500d5
Show file tree
Hide file tree
Showing 3 changed files with 34 additions and 7 deletions.
1 change: 1 addition & 0 deletions src/ydata_profiling/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,7 @@ class Histogram(BaseModel):
# Maximum number of bins (when bins=0)
max_bins: int = 250
x_axis_labels: bool = True
density: bool = False


class CatFrequencyPlot(BaseModel):
Expand Down
15 changes: 8 additions & 7 deletions src/ydata_profiling/model/summary_algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,15 +34,16 @@ def histogram_compute(
weights: Optional[np.ndarray] = None,
) -> dict:
stats = {}
bins = config.plot.histogram.bins
bins_arg = "auto" if bins == 0 else min(bins, n_unique)
hist_config = config.plot.histogram
bins_arg = "auto" if hist_config.bins == 0 else min(hist_config.bins, n_unique)
bins = np.histogram_bin_edges(finite_values, bins=bins_arg)
stats[name] = np.histogram(finite_values, bins=bins, weights=weights)
if len(bins) > hist_config.max_bins:
bins = np.histogram_bin_edges(finite_values, bins=hist_config.max_bins)
weights = weights if weights and len(weights) == hist_config.max_bins else None

max_bins = config.plot.histogram.max_bins
if bins_arg == "auto" and len(stats[name][1]) > max_bins:
bins = np.histogram_bin_edges(finite_values, bins=max_bins)
stats[name] = np.histogram(finite_values, bins=bins, weights=None)
stats[name] = np.histogram(
finite_values, bins=bins, weights=weights, density=config.plot.histogram.density
)

return stats

Expand Down
25 changes: 25 additions & 0 deletions tests/unit/test_summary_algos.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,12 @@
import pandas as pd
import pytest

from ydata_profiling.config import Settings
from ydata_profiling.model.summary_algorithms import (
describe_counts,
describe_generic,
describe_supported,
histogram_compute,
)


Expand Down Expand Up @@ -53,3 +55,26 @@ def test_summary_supported_empty_df(config, empty_data):
assert summary["p_distinct"] == 0
assert summary["n_unique"] == 0
assert not summary["is_unique"]


@pytest.fixture
def numpy_array():
    """Provide 1000 integers drawn at random from the values 0 through 9."""
    candidates = list(range(10))
    return np.random.choice(candidates, size=1000)


def test_compute_histogram(numpy_array):
    """Check histogram_compute with default settings and with density enabled."""
    config = Settings()
    n_unique = len(np.unique(numpy_array))

    # Default settings: the bin values must add up to the number of samples.
    result = histogram_compute(config, numpy_array, n_unique)
    assert "histogram" in result
    values = result["histogram"][0]
    edges = result["histogram"][1]
    assert len(values) == n_unique
    assert len(edges) == n_unique + 1
    assert sum(values) == len(numpy_array)

    # Density enabled: bin value times bin width must add up to roughly one.
    config.plot.histogram.density = True
    result = histogram_compute(config, numpy_array, n_unique)
    assert "histogram" in result
    values = result["histogram"][0]
    edges = result["histogram"][1]
    assert len(values) == n_unique
    assert len(edges) == n_unique + 1
    bin_areas = values * np.diff(edges)
    assert sum(bin_areas) == pytest.approx(1, 0.1)

0 comments on commit 1c500d5

Please sign in to comment.