From 0fabbf7a62b8cc4c3b3ce1ca32d2b2ea328d8cc7 Mon Sep 17 00:00:00 2001 From: Sohaib Arshid Date: Thu, 5 Jan 2023 17:11:54 +0100 Subject: [PATCH 1/6] fix: issue#915 error for large integers --- .../model/summary_algorithms.py | 4 ++- tests/issues/test_issue915.py | 30 +++++++++++++++++++ 2 files changed, 33 insertions(+), 1 deletion(-) create mode 100644 tests/issues/test_issue915.py diff --git a/src/pandas_profiling/model/summary_algorithms.py b/src/pandas_profiling/model/summary_algorithms.py index de09144ad..014ea5a1e 100644 --- a/src/pandas_profiling/model/summary_algorithms.py +++ b/src/pandas_profiling/model/summary_algorithms.py @@ -36,6 +36,7 @@ def histogram_compute( stats = {} bins = config.plot.histogram.bins bins_arg = "auto" if bins == 0 else min(bins, n_unique) + bins_arg = np.histogram_bin_edges(finite_values, bins=bins_arg) stats[name] = np.histogram(finite_values, bins=bins_arg, weights=weights) max_bins = config.plot.histogram.max_bins @@ -49,7 +50,8 @@ def chi_square( values: Optional[np.ndarray] = None, histogram: Optional[np.ndarray] = None ) -> dict: if histogram is None: - histogram, _ = np.histogram(values, bins="auto") + bins = bins = np.histogram_bin_edges(values, bins='auto') + histogram, _ = np.histogram(values, bins=bins) return dict(chisquare(histogram)._asdict()) diff --git a/tests/issues/test_issue915.py b/tests/issues/test_issue915.py new file mode 100644 index 000000000..15ea33146 --- /dev/null +++ b/tests/issues/test_issue915.py @@ -0,0 +1,30 @@ +""" +Test for issue 915: +https://github.com/ydataai/pandas-profiling/issues/915 + +Error for series with large integers. 
+""" +import fnmatch +import pandas as pd +from pandas_profiling import ProfileReport + +def test_issue915(): + df = pd.DataFrame({"col": pd.Series([716277643516076032 + i for i in range(100)])}) + df_profile = ProfileReport(df) + + def test_with_value(n_extreme_obs): + """Generate HTML and validate the tabs contain the proper tab titles.""" + df_profile.config.n_extreme_obs = n_extreme_obs + df_profile.invalidate_cache() + + reg_min = f"*Minimum {n_extreme_obs} values*" + reg_max = f"*Maximum {n_extreme_obs} values*" + + profile_html = df_profile.to_html() + + assert fnmatch.fnmatch(profile_html, reg_min) + assert fnmatch.fnmatch(profile_html, reg_max) + + test_with_value(5) + test_with_value(100) + test_with_value(120) \ No newline at end of file From 56e102ae0d91324fa26a282a3fa99739e2a25b40 Mon Sep 17 00:00:00 2001 From: Sohaib Arshid Date: Thu, 5 Jan 2023 17:22:10 +0100 Subject: [PATCH 2/6] fix: issue#915 added histogram_bin_edges with max_bins --- src/pandas_profiling/model/summary_algorithms.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/pandas_profiling/model/summary_algorithms.py b/src/pandas_profiling/model/summary_algorithms.py index 014ea5a1e..5be607cb0 100644 --- a/src/pandas_profiling/model/summary_algorithms.py +++ b/src/pandas_profiling/model/summary_algorithms.py @@ -41,7 +41,8 @@ def histogram_compute( max_bins = config.plot.histogram.max_bins if bins_arg == "auto" and len(stats[name][1]) > max_bins: - stats[name] = np.histogram(finite_values, bins=max_bins, weights=None) + bins_arg = np.histogram_bin_edges(finite_values, bins=max_bins) + stats[name] = np.histogram(finite_values, bins=bins_arg, weights=None) return stats From 21d8042ea9347535d7cadd627e1711c32c641b02 Mon Sep 17 00:00:00 2001 From: Sohaib Arshid Date: Tue, 24 Jan 2023 11:02:25 +0100 Subject: [PATCH 3/6] fix: issue#915 lint issues --- src/pandas_profiling/model/summary_algorithms.py | 2 +- tests/issues/test_issue915.py | 5 ++++- 2 files changed, 5 
insertions(+), 2 deletions(-) diff --git a/src/pandas_profiling/model/summary_algorithms.py b/src/pandas_profiling/model/summary_algorithms.py index 5be607cb0..6416e9d16 100644 --- a/src/pandas_profiling/model/summary_algorithms.py +++ b/src/pandas_profiling/model/summary_algorithms.py @@ -51,7 +51,7 @@ def chi_square( values: Optional[np.ndarray] = None, histogram: Optional[np.ndarray] = None ) -> dict: if histogram is None: - bins = bins = np.histogram_bin_edges(values, bins='auto') + bins = bins = np.histogram_bin_edges(values, bins="auto") histogram, _ = np.histogram(values, bins=bins) return dict(chisquare(histogram)._asdict()) diff --git a/tests/issues/test_issue915.py b/tests/issues/test_issue915.py index 15ea33146..6d1703c6b 100644 --- a/tests/issues/test_issue915.py +++ b/tests/issues/test_issue915.py @@ -5,9 +5,12 @@ Error for series with large integers. """ import fnmatch + import pandas as pd + from pandas_profiling import ProfileReport + def test_issue915(): df = pd.DataFrame({"col": pd.Series([716277643516076032 + i for i in range(100)])}) df_profile = ProfileReport(df) @@ -27,4 +30,4 @@ def test_with_value(n_extreme_obs): test_with_value(5) test_with_value(100) - test_with_value(120) \ No newline at end of file + test_with_value(120) From 78cfb306ce6d88a3bfa32db81d31641be480c71e Mon Sep 17 00:00:00 2001 From: Sohaib Arshid Date: Tue, 24 Jan 2023 12:12:12 +0100 Subject: [PATCH 4/6] fix: issue#915 flake8 errors --- src/pandas_profiling/model/summary_algorithms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pandas_profiling/model/summary_algorithms.py b/src/pandas_profiling/model/summary_algorithms.py index 6416e9d16..0bae2d7ab 100644 --- a/src/pandas_profiling/model/summary_algorithms.py +++ b/src/pandas_profiling/model/summary_algorithms.py @@ -51,7 +51,7 @@ def chi_square( values: Optional[np.ndarray] = None, histogram: Optional[np.ndarray] = None ) -> dict: if histogram is None: - bins = bins = 
np.histogram_bin_edges(values, bins="auto") + bins = np.histogram_bin_edges(values, bins="auto") histogram, _ = np.histogram(values, bins=bins) return dict(chisquare(histogram)._asdict()) From e689665c1245f7e968024b68330367956d0ed80d Mon Sep 17 00:00:00 2001 From: Sohaib Arshid Date: Tue, 24 Jan 2023 12:35:19 +0100 Subject: [PATCH 5/6] fix: issue#915 bin args --- src/pandas_profiling/model/summary_algorithms.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pandas_profiling/model/summary_algorithms.py b/src/pandas_profiling/model/summary_algorithms.py index 0bae2d7ab..6251a6c2b 100644 --- a/src/pandas_profiling/model/summary_algorithms.py +++ b/src/pandas_profiling/model/summary_algorithms.py @@ -36,8 +36,8 @@ def histogram_compute( stats = {} bins = config.plot.histogram.bins bins_arg = "auto" if bins == 0 else min(bins, n_unique) - bins_arg = np.histogram_bin_edges(finite_values, bins=bins_arg) - stats[name] = np.histogram(finite_values, bins=bins_arg, weights=weights) + bins = np.histogram_bin_edges(finite_values, bins=bins_arg) + stats[name] = np.histogram(finite_values, bins=bins, weights=weights) max_bins = config.plot.histogram.max_bins if bins_arg == "auto" and len(stats[name][1]) > max_bins: From 6a5409e6fa9f87fcfea60bda5632ca71462de791 Mon Sep 17 00:00:00 2001 From: Sohaib Arshid Date: Tue, 24 Jan 2023 15:01:05 +0100 Subject: [PATCH 6/6] fix: issue#915 rename var bin_args --- src/pandas_profiling/model/summary_algorithms.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pandas_profiling/model/summary_algorithms.py b/src/pandas_profiling/model/summary_algorithms.py index 6251a6c2b..07e47c97d 100644 --- a/src/pandas_profiling/model/summary_algorithms.py +++ b/src/pandas_profiling/model/summary_algorithms.py @@ -41,8 +41,8 @@ def histogram_compute( max_bins = config.plot.histogram.max_bins if bins_arg == "auto" and len(stats[name][1]) > max_bins: - bins_arg = np.histogram_bin_edges(finite_values, 
bins=max_bins) - stats[name] = np.histogram(finite_values, bins=bins_arg, weights=None) + bins = np.histogram_bin_edges(finite_values, bins=max_bins) + stats[name] = np.histogram(finite_values, bins=bins, weights=None) return stats