Skip to content

Commit

Permalink
fix: type schema not checking for empty columns (#1679)
Browse files Browse the repository at this point in the history
* fix: type schema not checking for empty columns

* fix: remove alerts unused parameters

* fix: indicate user defined type on empty columns

* fix(linting): code formatting

---------

Co-authored-by: Azory YData Bot <[email protected]>
  • Loading branch information
alexbarros and azory-ydata authored Dec 6, 2024
1 parent 1e8cb89 commit f3bc959
Show file tree
Hide file tree
Showing 4 changed files with 32 additions and 9 deletions.
4 changes: 2 additions & 2 deletions src/ydata_profiling/model/alerts.py
Original file line number Diff line number Diff line change
Expand Up @@ -634,7 +634,7 @@ def supported_alerts(summary: dict) -> List[Alert]:
return alerts


def unsupported_alerts(summary: Dict[str, Any]) -> List[Alert]:
def unsupported_alerts() -> List[Alert]:
alerts: List[Alert] = [
UnsupportedAlert(),
RejectedAlert(),
Expand All @@ -657,7 +657,7 @@ def check_variable_alerts(config: Settings, col: str, description: dict) -> List
alerts += generic_alerts(description)

if description["type"] == "Unsupported":
alerts += unsupported_alerts(description)
alerts += unsupported_alerts()
else:
alerts += supported_alerts(description)

Expand Down
24 changes: 18 additions & 6 deletions src/ydata_profiling/model/pandas/summary_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,14 @@
from ydata_profiling.utils.dataframe import sort_column_names


def _is_cast_type_defined(typeset: VisionsTypeset, series: str) -> bool:
    """Tell whether the user declared an explicit type for a column.

    Args:
        typeset: Typeset in use. Only a ``ProfilingTypeSet`` is inspected
            for a ``type_schema``.
        series: Name of the column to look up (the column *name*, not the
            data itself).

    Returns:
        True when ``typeset`` is a ``ProfilingTypeSet`` whose non-empty
        ``type_schema`` contains ``series``; False otherwise.
    """
    if not isinstance(typeset, ProfilingTypeSet):
        return False

    type_schema = typeset.type_schema
    # A bare `and` chain would hand back the falsy schema object itself
    # (e.g. `{}` or `None`); coerce so the result is always a real bool.
    return bool(type_schema) and series in type_schema


@describe_1d.register
def pandas_describe_1d(
config: Settings,
Expand All @@ -38,11 +46,10 @@ def pandas_describe_1d(
# Make sure pd.NA is not in the series
series = series.fillna(np.nan)

if (
isinstance(typeset, ProfilingTypeSet)
and typeset.type_schema
and series.name in typeset.type_schema
):
has_cast_type = _is_cast_type_defined(typeset, series.name)
cast_type = str(typeset.type_schema[series.name]) if has_cast_type else None

if has_cast_type and not series.isna().all():
vtype = typeset.type_schema[series.name]

elif config.infer_dtypes:
Expand All @@ -55,7 +62,12 @@ def pandas_describe_1d(
vtype = typeset.detect_type(series)

typeset.type_schema[series.name] = vtype
return summarizer.summarize(config, series, dtype=vtype)
summary = summarizer.summarize(config, series, dtype=vtype)
# Cast type is only used on unsupported columns rendering pipeline
# to indicate the correct variable type when inference is not possible
summary["cast_type"] = cast_type

return summary


@get_series_descriptions.register
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ def render_generic(config: Settings, summary: dict) -> dict:
info = VariableInfo(
anchor_id=summary["varid"],
alerts=summary["alerts"],
var_type="Unsupported",
var_type=summary["cast_type"] or "Unsupported",
var_name=summary["varname"],
description=summary["description"],
style=config.html.style,
Expand Down
11 changes: 11 additions & 0 deletions tests/unit/test_typeset_default.py
Original file line number Diff line number Diff line change
Expand Up @@ -475,3 +475,14 @@ def test_type_schema(dataframe: pd.DataFrame, column: str, type_schema: dict):
assert prof.typeset.type_schema[column] == prof.typeset._get_type(
type_schema[column]
)


def test_type_schema_with_null_column():
    """An all-null column is reported as Unsupported despite a declared type."""
    frame = pd.DataFrame({"null_col": [None] * 100})

    # Same check for each user-declared type: nothing can be cast from an
    # entirely empty column, so the resulting type must be "Unsupported".
    for declared_type in ("datetime", "numeric"):
        report = ProfileReport(frame, type_schema={"null_col": declared_type})
        variables = report.description_set.variables
        assert variables["null_col"]["type"] == "Unsupported"

0 comments on commit f3bc959

Please sign in to comment.