Skip to content

Commit

Permalink
fix: zero division for categorical columns with 100% missing data (#1569)
Browse files Browse the repository at this point in the history
* fix: zero division for cat columns with only missing data

* fix(linting): code formatting

* fix: update default chisquare value

* fix: boolean and date failures with empty data

* fix(linting): code formatting

---------

Co-authored-by: Azory YData Bot <[email protected]>
  • Loading branch information
alexbarros and azory-ydata authored Apr 4, 2024
1 parent 7612150 commit 13522d0
Show file tree
Hide file tree
Showing 5 changed files with 45 additions and 17 deletions.
17 changes: 13 additions & 4 deletions src/ydata_profiling/model/pandas/describe_boolean_pandas.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from typing import Tuple

import numpy as np
import pandas as pd

from ydata_profiling.config import Settings
Expand All @@ -26,9 +27,17 @@ def pandas_describe_boolean_1d(
A dict containing calculated series description values.
"""

value_counts = summary["value_counts_without_nan"]
summary.update({"top": value_counts.index[0], "freq": value_counts.iloc[0]})

summary["imbalance"] = column_imbalance_score(value_counts, len(value_counts))
value_counts: pd.Series = summary["value_counts_without_nan"]
if not value_counts.empty:
summary.update({"top": value_counts.index[0], "freq": value_counts.iloc[0]})
summary["imbalance"] = column_imbalance_score(value_counts, len(value_counts))
else:
summary.update(
{
"top": np.nan,
"freq": 0,
"imbalance": 0,
}
)

return config, series, summary
Original file line number Diff line number Diff line change
Expand Up @@ -195,10 +195,14 @@ def length_summary_vc(vc: pd.Series) -> dict:

summary = {
"max_length": np.max(length_counts.index),
"mean_length": np.average(length_counts.index, weights=length_counts.values),
"mean_length": np.average(length_counts.index, weights=length_counts.values)
if not length_counts.empty
else np.nan,
"median_length": weighted_median(
length_counts.index.values, weights=length_counts.values
),
)
if not length_counts.empty
else np.nan,
"min_length": np.min(length_counts.index),
"length_histogram": length_counts,
}
Expand Down
30 changes: 20 additions & 10 deletions src/ydata_profiling/model/pandas/describe_date_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,16 +29,26 @@ def pandas_describe_date_1d(
Returns:
A dict containing calculated series description values.
"""
summary.update(
{
"min": pd.Timestamp.to_pydatetime(series.min()),
"max": pd.Timestamp.to_pydatetime(series.max()),
}
)

summary["range"] = summary["max"] - summary["min"]

values = series.values.astype(np.int64) // 10**9
if summary["value_counts_without_nan"].empty:
values = series.values
summary.update(
{
"min": pd.NaT,
"max": pd.NaT,
"range": 0,
}
)
else:
summary.update(
{
"min": pd.Timestamp.to_pydatetime(series.min()),
"max": pd.Timestamp.to_pydatetime(series.max()),
}
)

summary["range"] = summary["max"] - summary["min"]

values = series.values.astype(np.int64) // 10**9

if config.vars.num.chi_squared_threshold > 0.0:
summary["chi_squared"] = chi_square(values)
Expand Down
4 changes: 4 additions & 0 deletions src/ydata_profiling/model/summary_algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ def histogram_compute(
weights: Optional[np.ndarray] = None,
) -> dict:
stats = {}
if len(finite_values) == 0:
return {name: []}
hist_config = config.plot.histogram
bins_arg = "auto" if hist_config.bins == 0 else min(hist_config.bins, n_unique)
bins = np.histogram_bin_edges(finite_values, bins=bins_arg)
Expand All @@ -54,6 +56,8 @@ def chi_square(
if histogram is None:
bins = np.histogram_bin_edges(values, bins="auto")
histogram, _ = np.histogram(values, bins=bins)
if len(histogram) == 0 or np.sum(histogram) == 0:
return {"statistic": 0, "pvalue": 0}
return dict(chisquare(histogram)._asdict())


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -103,13 +103,14 @@ def render_date(config: Settings, summary: Dict[str, Any]) -> Dict[str, Any]:
)

# Bottom
n_bins = len(summary["histogram"][1]) - 1 if summary["histogram"] else 0
bottom = Container(
[
Image(
hist_data,
image_format=image_format,
alt="Histogram",
caption=f"<strong>Histogram with fixed size bins</strong> (bins={len(summary['histogram'][1]) - 1})",
caption=f"<strong>Histogram with fixed size bins</strong> (bins={n_bins})",
name="Histogram",
anchor_id=f"{varid}histogram",
)
Expand Down

0 comments on commit 13522d0

Please sign in to comment.