Skip to content

Commit

Permalink
fix: typeset invalid dates errors (#1678)
Browse files Browse the repository at this point in the history
* fix: ignore invalid dates during conversion

* fix: apply type conversion to user defined types

* test: add unit test for invalid date type conversion

* fix: add invalid dates to variable info

* fix(linting): code formatting

* test: update unit tests

* fix: rename to_datetime method
  • Loading branch information
alexbarros authored Dec 4, 2024
1 parent 816f1b7 commit 1e8cb89
Show file tree
Hide file tree
Showing 4 changed files with 48 additions and 1 deletion.
22 changes: 21 additions & 1 deletion src/ydata_profiling/model/pandas/describe_date_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,13 @@
series_handle_nulls,
series_hashable,
)
from ydata_profiling.model.typeset_relations import is_pandas_1


def to_datetime(series: pd.Series) -> pd.Series:
    """Convert *series* to datetimes, turning unparseable entries into ``NaT``.

    Args:
        series: The values to convert.

    Returns:
        The result of ``pd.to_datetime`` with ``errors="coerce"``, so values
        that cannot be parsed become ``NaT`` instead of raising.
    """
    parse_options = {"errors": "coerce"}
    # ``format="mixed"`` is only passed when ``is_pandas_1()`` is false
    # (presumably pandas >= 2, where that option exists -- confirm against
    # the helper's definition).
    if not is_pandas_1():
        parse_options["format"] = "mixed"
    return pd.to_datetime(series, **parse_options)


@describe_date_1d.register
Expand All @@ -29,6 +36,12 @@ def pandas_describe_date_1d(
Returns:
A dict containing calculated series description values.
"""
og_series = series.dropna()
series = to_datetime(og_series)
invalid_values = og_series[series.isna()]

series = series.dropna()

if summary["value_counts_without_nan"].empty:
values = series.values
summary.update(
Expand All @@ -53,5 +66,12 @@ def pandas_describe_date_1d(
if config.vars.num.chi_squared_threshold > 0.0:
summary["chi_squared"] = chi_square(values)

summary.update(histogram_compute(config, values, summary["n_distinct"]))
summary.update(histogram_compute(config, values, series.nunique()))
summary.update(
{
"invalid_dates": invalid_values.nunique(),
"n_invalid_dates": len(invalid_values),
"p_invalid_dates": len(invalid_values) / summary["n"],
}
)
return config, values, summary
1 change: 1 addition & 0 deletions src/ydata_profiling/model/pandas/summary_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ def pandas_describe_1d(
and series.name in typeset.type_schema
):
vtype = typeset.type_schema[series.name]

elif config.infer_dtypes:
# Infer variable types
vtype = typeset.infer_type(series)
Expand Down
10 changes: 10 additions & 0 deletions src/ydata_profiling/report/structure/variables/render_date.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,16 @@ def render_date(config: Settings, summary: Dict[str, Any]) -> Dict[str, Any]:
[
{"name": "Minimum", "value": fmt(summary["min"]), "alert": False},
{"name": "Maximum", "value": fmt(summary["max"]), "alert": False},
{
"name": "Invalid dates",
"value": fmt(summary["n_invalid_dates"]),
"alert": False,
},
{
"name": "Invalid dates (%)",
"value": fmt_percent(summary["p_invalid_dates"]),
"alert": False,
},
],
style=config.html.style,
)
Expand Down
16 changes: 16 additions & 0 deletions tests/unit/test_describe.py
Original file line number Diff line number Diff line change
Expand Up @@ -582,3 +582,19 @@ def test_describe_list(summarizer, typeset):

with pytest.raises(NotImplementedError):
describe(config, "", [1, 2, 3], summarizer, typeset)


def test_decribe_series_type_schema(config, summarizer):
    """Describe a column forced to datetime through the type schema.

    The "date" column is declared as "datetime" via ``type_schema``; the test
    expects two of its four values to be reported as invalid dates rather
    than as missing values.
    """
    typeset = ProfilingTypeSet(config, type_schema={"date": "datetime"})
    frame = pd.DataFrame(
        {
            "value": [1, 2, 3, 4],
            # NOTE(review): the first two strings presumably fall outside the
            # range pandas can represent as nanosecond timestamps, which is
            # why they are expected to count as invalid -- confirm.
            "date": ["0001-01-01", "9999-12-31", "2022-10-03", "2022-10-04"],
        }
    )

    description = describe(config, frame, summarizer, typeset)
    date_stats = description.variables["date"]

    assert date_stats["type"] == "DateTime"
    assert date_stats["n_missing"] == 0
    assert date_stats["n_invalid_dates"] == 2
    assert date_stats["p_invalid_dates"] == 0.5

0 comments on commit 1e8cb89

Please sign in to comment.