From 13522d0b83c75abf2ad254a2912c63b395bcbc6d Mon Sep 17 00:00:00 2001 From: Alex Barros Date: Thu, 4 Apr 2024 16:22:49 -0300 Subject: [PATCH] fix: zero division for categorical columns with 100% missing data (#1569) * fix: zero division for cat columns with only missing data * fix(linting): code formatting * fix: update default chisquare value * fix: boolean and date failures with empty data * fix(linting): code formatting --------- Co-authored-by: Azory YData Bot --- .../model/pandas/describe_boolean_pandas.py | 17 ++++++++--- .../pandas/describe_categorical_pandas.py | 8 +++-- .../model/pandas/describe_date_pandas.py | 30 ++++++++++++------- .../model/summary_algorithms.py | 4 +++ .../report/structure/variables/render_date.py | 3 +- 5 files changed, 45 insertions(+), 17 deletions(-) diff --git a/src/ydata_profiling/model/pandas/describe_boolean_pandas.py b/src/ydata_profiling/model/pandas/describe_boolean_pandas.py index 492172e52..9b2014db7 100644 --- a/src/ydata_profiling/model/pandas/describe_boolean_pandas.py +++ b/src/ydata_profiling/model/pandas/describe_boolean_pandas.py @@ -1,5 +1,6 @@ from typing import Tuple +import numpy as np import pandas as pd from ydata_profiling.config import Settings @@ -26,9 +27,17 @@ def pandas_describe_boolean_1d( A dict containing calculated series description values. 
""" - value_counts = summary["value_counts_without_nan"] - summary.update({"top": value_counts.index[0], "freq": value_counts.iloc[0]}) - - summary["imbalance"] = column_imbalance_score(value_counts, len(value_counts)) + value_counts: pd.Series = summary["value_counts_without_nan"] + if not value_counts.empty: + summary.update({"top": value_counts.index[0], "freq": value_counts.iloc[0]}) + summary["imbalance"] = column_imbalance_score(value_counts, len(value_counts)) + else: + summary.update( + { + "top": np.nan, + "freq": 0, + "imbalance": 0, + } + ) return config, series, summary diff --git a/src/ydata_profiling/model/pandas/describe_categorical_pandas.py b/src/ydata_profiling/model/pandas/describe_categorical_pandas.py index 5f72acebd..31ae57417 100644 --- a/src/ydata_profiling/model/pandas/describe_categorical_pandas.py +++ b/src/ydata_profiling/model/pandas/describe_categorical_pandas.py @@ -195,10 +195,14 @@ def length_summary_vc(vc: pd.Series) -> dict: summary = { "max_length": np.max(length_counts.index), - "mean_length": np.average(length_counts.index, weights=length_counts.values), + "mean_length": np.average(length_counts.index, weights=length_counts.values) + if not length_counts.empty + else np.nan, "median_length": weighted_median( length_counts.index.values, weights=length_counts.values - ), + ) + if not length_counts.empty + else np.nan, "min_length": np.min(length_counts.index), "length_histogram": length_counts, } diff --git a/src/ydata_profiling/model/pandas/describe_date_pandas.py b/src/ydata_profiling/model/pandas/describe_date_pandas.py index b8df2e3ad..1ff64a50f 100644 --- a/src/ydata_profiling/model/pandas/describe_date_pandas.py +++ b/src/ydata_profiling/model/pandas/describe_date_pandas.py @@ -29,16 +29,26 @@ def pandas_describe_date_1d( Returns: A dict containing calculated series description values. 
""" - summary.update( - { - "min": pd.Timestamp.to_pydatetime(series.min()), - "max": pd.Timestamp.to_pydatetime(series.max()), - } - ) - - summary["range"] = summary["max"] - summary["min"] - - values = series.values.astype(np.int64) // 10**9 + if summary["value_counts_without_nan"].empty: + values = series.values + summary.update( + { + "min": pd.NaT, + "max": pd.NaT, + "range": 0, + } + ) + else: + summary.update( + { + "min": pd.Timestamp.to_pydatetime(series.min()), + "max": pd.Timestamp.to_pydatetime(series.max()), + } + ) + + summary["range"] = summary["max"] - summary["min"] + + values = series.values.astype(np.int64) // 10**9 if config.vars.num.chi_squared_threshold > 0.0: summary["chi_squared"] = chi_square(values) diff --git a/src/ydata_profiling/model/summary_algorithms.py b/src/ydata_profiling/model/summary_algorithms.py index cd4b5743e..b97a72ca7 100644 --- a/src/ydata_profiling/model/summary_algorithms.py +++ b/src/ydata_profiling/model/summary_algorithms.py @@ -34,6 +34,8 @@ def histogram_compute( weights: Optional[np.ndarray] = None, ) -> dict: stats = {} + if len(finite_values) == 0: + return {name: []} hist_config = config.plot.histogram bins_arg = "auto" if hist_config.bins == 0 else min(hist_config.bins, n_unique) bins = np.histogram_bin_edges(finite_values, bins=bins_arg) @@ -54,6 +56,8 @@ def chi_square( if histogram is None: bins = np.histogram_bin_edges(values, bins="auto") histogram, _ = np.histogram(values, bins=bins) + if len(histogram) == 0 or np.sum(histogram) == 0: + return {"statistic": 0, "pvalue": 0} return dict(chisquare(histogram)._asdict()) diff --git a/src/ydata_profiling/report/structure/variables/render_date.py b/src/ydata_profiling/report/structure/variables/render_date.py index aa6850516..c75a80a5e 100644 --- a/src/ydata_profiling/report/structure/variables/render_date.py +++ b/src/ydata_profiling/report/structure/variables/render_date.py @@ -103,13 +103,14 @@ def render_date(config: Settings, summary: Dict[str, Any]) -> 
Dict[str, Any]: ) # Bottom + n_bins = len(summary["histogram"][1]) - 1 if summary["histogram"] else 0 bottom = Container( [ Image( hist_data, image_format=image_format, alt="Histogram", - caption=f"Histogram with fixed size bins (bins={len(summary['histogram'][1]) - 1})", + caption=f"Histogram with fixed size bins (bins={n_bins})", name="Histogram", anchor_id=f"{varid}histogram", )