Use nullable type handling in components' fit, transform, and predict methods (#4046)

* Remove existing nullable type handling from oversampler and use _handle_nullable_types instead

* Add handle call for lgbm regressor and remove existing handling

* Add handle call for lgbm classifier

* temp broken exp smoothing tests

* lint fix

* add release note

* Fix broken tests by initting woodwork on y in lgbm classifier

* Update tests

* Call handle in arima

* Call handle from ts imputer; y ltype is the downcasted value

* remove unnecessary comments

* Fix time series guide

* lint fix

* Only call handle_nullable_types when necessary in methods

* Remove remaining unnecessary handle calls

* resolve remaining comments

* Add y ww init back into ts imputer to fix tests

* Copy X in testing nullable types to stop hiding potential incompatibilities in methods

* use X_d in lgbm predict proba

* remove nullable type handling after sklearn upgrade fixed incompatibilities

* use common util to determine type for time series imputed integers

* Add comments around why we copy X

* remove _prepare_data from samplers

* PR comments

* remove tests to check if handle method is called

* remove nullable types from imputed data because of regularizer

* fix typo

* fix docstrings

* fix codecov issues

* PR comments

* Revert "Fix time series guide"

This reverts commit 964622a.

* Return unchanged ltype in nullable type utils

* add back ts imputer incompatibility test

* use dict get return value

* call handle nullable types in oversampler and check schema equality
tamargrey authored and Tamar Grey committed Mar 27, 2023
1 parent 0c83183 commit abd16f7
Showing 17 changed files with 267 additions and 114 deletions.
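All of the diffs below follow one pattern: downcast pandas/woodwork nullable types right before handing data to a fit, transform, or predict call, instead of scattering ad-hoc casts through each component. As a rough orientation, here is a minimal standalone sketch of that downcasting idea; it is NOT evalml's actual `_handle_nullable_types`, which works on woodwork-initialized data and consults each component's `_integer_nullable_incompatibilities` / `_boolean_nullable_incompatibilities` flags (both visible in the TimeSeriesImputer diff below).

```python
# Minimal sketch of the downcasting pattern this commit applies (an
# illustration, not evalml's implementation).
from typing import Optional, Tuple

import pandas as pd


def handle_nullable_types(
    X: pd.DataFrame,
    y: Optional[pd.Series] = None,
) -> Tuple[pd.DataFrame, Optional[pd.Series]]:
    """Downcast pandas nullable dtypes to types most estimators tolerate."""
    X = X.copy()
    for col in X.columns:
        dtype = str(X[col].dtype)
        if dtype == "Int64":
            # int64 cannot hold pd.NA, so float64 is the safe target.
            X[col] = X[col].astype("float64")
        elif dtype == "boolean":
            # Nullable booleans downcast to bool when complete, float64 otherwise.
            target = "float64" if X[col].isna().any() else "bool"
            X[col] = X[col].astype(target)
    if y is not None and str(y.dtype) in {"Int64", "boolean"}:
        y = y.astype("float64")
    return X, y


# Fit-time call mirroring the `X_d, y_d = self._handle_nullable_types(X, y)`
# lines in the diffs below:
X = pd.DataFrame({"nums": pd.array([1, None, 3], dtype="Int64")})
y = pd.Series(pd.array([0, 1, 0], dtype="Int64"))
X_d, y_d = handle_nullable_types(X, y)
print(X_d.dtypes["nums"], y_d.dtype)  # float64 float64
```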
1 change: 1 addition & 0 deletions docs/source/release_notes.rst
@@ -6,6 +6,7 @@ Release Notes
 * Changes
     * Calculated partial dependence grid values for integer data by rounding instead of truncating fractional values :pr:`4096`
     * Remove unnecessary logic from imputer components prior to nullable type handling :pr:`4038`
+    * Added calls to ``_handle_nullable_types`` in component fit, transform, and predict methods when needed :pr:`4046`
 * Documentation Changes
 * Testing Changes

@@ -189,8 +189,11 @@ def fit(self, X, y=None):
             self
         """
         X = infer_feature_types(X)
-        X_encoded = self._encode_categories(X, fit=True)
-        y_encoded = self._encode_labels(y)
+        if y is not None:
+            y = infer_feature_types(y)
+        X_d, y_d = self._handle_nullable_types(X, y)
+        X_encoded = self._encode_categories(X_d, fit=True)
+        y_encoded = self._encode_labels(y_d)
         self._component_obj.fit(X_encoded, y_encoded)
         return self

@@ -204,7 +207,8 @@ def predict(self, X):
             pd.DataFrame: Predicted values.
         """
         X_encoded = self._encode_categories(X)
-        predictions = super().predict(X_encoded)
+        X_d, _ = self._handle_nullable_types(X_encoded)
+        predictions = super().predict(X_d)
         if not self._label_encoder:
             return predictions
         predictions = self._label_encoder.inverse_transform(
@@ -222,4 +226,5 @@ def predict_proba(self, X):
             pd.DataFrame: Predicted probability values.
         """
         X_encoded = self._encode_categories(X)
-        return super().predict_proba(X_encoded)
+        X_d, _ = self._handle_nullable_types(X_encoded)
+        return super().predict_proba(X_d)
@@ -9,7 +9,6 @@
 from evalml.pipelines.components.estimators import Estimator
 from evalml.problem_types import ProblemTypes
 from evalml.utils import (
-    downcast_int_nullable_to_double,
     import_or_raise,
     infer_feature_types,
 )
@@ -213,10 +212,10 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
         Raises:
             ValueError: If y was not passed in.
         """
-        if X is not None:
-            X = downcast_int_nullable_to_double(X)
-            X = X.fillna(X.mean())
         X, y = self._manage_woodwork(X, y)
+        X, y = self._handle_nullable_types(X, y)
+        if X is not None:
+            X = X.ww.fillna(X.mean())
         if y is None:
             raise ValueError("ARIMA Regressor requires y as input.")

@@ -119,6 +119,8 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
         if y is None:
             raise ValueError("Exponential Smoothing Regressor requires y as input.")
 
+        X, y = self._handle_nullable_types(X, y)
+
         y = self._remove_datetime(y)
 
         self._component_obj.fit(y=y)
@@ -11,7 +11,6 @@
 from evalml.utils import (
     SEED_BOUNDS,
     _rename_column_names_to_numeric,
-    downcast_int_nullable_to_double,
     import_or_raise,
     infer_feature_types,
 )
@@ -170,8 +169,8 @@ def fit(self, X, y=None):
         X_encoded = self._encode_categories(X, fit=True)
         if y is not None:
             y = infer_feature_types(y)
-        X_encoded = downcast_int_nullable_to_double(X_encoded)
-        self._component_obj.fit(X_encoded, y)
+        X_d, y_d = self._handle_nullable_types(X_encoded, y)
+        self._component_obj.fit(X_d, y_d)
         return self
 
     def predict(self, X):
@@ -184,4 +183,5 @@ def predict(self, X):
             pd.Series: Predicted values.
         """
         X_encoded = self._encode_categories(X)
-        return super().predict(X_encoded)
+        X_d, _ = self._handle_nullable_types(X_encoded)
+        return super().predict(X_d)
@@ -1,10 +1,19 @@
 """Component that imputes missing data according to a specified timeseries-specific imputation strategy."""
 
 
 import pandas as pd
 import woodwork as ww
-from woodwork.logical_types import BooleanNullable, Double
+from woodwork.logical_types import (
+    BooleanNullable,
+    Double,
+)
 
 from evalml.pipelines.components.transformers import Transformer
 from evalml.utils import infer_feature_types
+from evalml.utils.nullable_type_utils import (
+    _determine_fractional_type,
+    _determine_non_nullable_equivalent,
+)


class TimeSeriesImputer(Transformer):
@@ -52,7 +61,7 @@ class TimeSeriesImputer(Transformer):
     # Incompatibility: https://github.com/alteryx/evalml/issues/4001
     # TODO: Remove when support is added https://github.com/alteryx/evalml/issues/4014
     _integer_nullable_incompatibilities = ["X", "y"]
-    _boolean_nullable_incompatibilities = ["X", "y"]
+    _boolean_nullable_incompatibilities = ["y"]

def __init__(
self,
@@ -155,35 +164,54 @@ def transform(self, X, y=None):
         if y is not None:
             y = infer_feature_types(y)
 
+        # This will change the logical type of BooleanNullable/IntegerNullable/AgeNullable columns with nans
+        # so we save the original schema to recreate it where possible after imputation
+        original_schema = X.ww.schema
+        X, y = self._handle_nullable_types(X, y)
+
         X_not_all_null = X.ww.drop(self._all_null_cols)
-        X_schema = X_not_all_null.ww.schema
-        X_schema = X_schema.get_subset_schema(
-            subset_cols=X_schema._filter_cols(
-                exclude=["IntegerNullable", "BooleanNullable", "AgeNullable"],
-            ),
+
+        # Because the TimeSeriesImputer is always used with the TimeSeriesRegularizer,
+        # many of the columns containing nans may have originally been non nullable logical types.
+        # We will use the non nullable equivalents where possible
+        original_schema = original_schema.get_subset_schema(
+            list(X_not_all_null.columns),
         )
+        new_ltypes = {
+            col: _determine_non_nullable_equivalent(ltype)
+            for col, ltype in original_schema.logical_types.items()
+        }
 
         if self._forwards_cols is not None:
-            X_forward = X.ww[self._forwards_cols]
+            X_forward = X[self._forwards_cols]
             imputed = X_forward.pad()
             imputed.bfill(inplace=True)  # Fill in the first value, if missing
             X_not_all_null[X_forward.columns] = imputed
 
         if self._backwards_cols is not None:
-            X_backward = X.ww[self._backwards_cols]
+            X_backward = X[self._backwards_cols]
             imputed = X_backward.bfill()
             imputed.pad(inplace=True)  # Fill in the last value, if missing
             X_not_all_null[X_backward.columns] = imputed
 
         if self._interpolate_cols is not None:
-            X_interpolate = X.ww[self._interpolate_cols]
-            # TODO: Revert when pandas introduces Float64 dtype
-            imputed = X_interpolate.astype(
-                float,
-            ).interpolate()  # Cast to float because Int64 not handled
+            X_interpolate = X_not_all_null[self._interpolate_cols]
+            imputed = X_interpolate.interpolate()
             imputed.bfill(inplace=True)  # Fill in the first value, if missing
             X_not_all_null[X_interpolate.columns] = imputed
-        X_not_all_null.ww.init(schema=X_schema)
+
+        # Interpolate may add floating point values to integer data, so we
+        # have to update those logical types to a fractional type
+        int_cols_to_update = original_schema._filter_cols(
+            include=["IntegerNullable", "AgeNullable"],
+        )
+        new_int_ltypes = {
+            col: _determine_fractional_type(ltype)
+            for col, ltype in original_schema.logical_types.items()
+            if col in int_cols_to_update
+        }
+        new_ltypes.update(new_int_ltypes)
+        X_not_all_null.ww.init(schema=original_schema, logical_types=new_ltypes)
 
         y_imputed = pd.Series(y)
         if y is not None and len(y) > 0:
@@ -194,10 +222,10 @@ def transform(self, X, y=None):
                 y_imputed = y.bfill()
                 y_imputed.pad(inplace=True)
             elif self._impute_target == "interpolate":
-                # TODO: Revert when pandas introduces Float64 dtype
-                y_imputed = y.astype(float).interpolate()
+                y_imputed = y.interpolate()
                 y_imputed.bfill(inplace=True)
-            y_imputed = ww.init_series(y_imputed)
+            # Re-initialize woodwork with the downcast logical type
+            y_imputed = ww.init_series(y_imputed, logical_type=y.ww.logical_type)
 
         return X_not_all_null, y_imputed

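The fractional-type bookkeeping above exists because interpolation can produce non-integer values in integer columns. A small illustration; the mapping shown is an assumption for the example, the real one lives in evalml's `_determine_fractional_type`:

```python
# Filling the gap in [1, <NA>, 4] yields 2.5, which no longer fits an
# integer logical type, so the column's ltype must become fractional.
import pandas as pd

s = pd.Series([1.0, None, 4.0])  # downcast view of an IntegerNullable column
print(s.interpolate().tolist())  # [1.0, 2.5, 4.0] -- fractional values appear

ASSUMED_FRACTIONAL_EQUIVALENTS = {  # hypothetical mapping, for illustration only
    "IntegerNullable": "Double",
    "AgeNullable": "AgeFractional",
}
```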
44 changes: 5 additions & 39 deletions evalml/pipelines/components/transformers/samplers/base_sampler.py
@@ -2,8 +2,6 @@
 import copy
 from abc import abstractmethod
 
-from woodwork.logical_types import IntegerNullable
-
 from evalml.pipelines.components.transformers import Transformer
 from evalml.utils.woodwork_utils import infer_feature_types

@@ -36,7 +34,8 @@ def fit(self, X, y):
         """
         if y is None:
            raise ValueError("y cannot be None")
-        X_ww, y_ww = self._prepare_data(X, y)
+        X_ww = infer_feature_types(X)
+        y_ww = infer_feature_types(y)
         self._initialize_sampler(X_ww, y_ww)
         return self

@@ -49,41 +48,7 @@ def _initialize_sampler(self, X, y):
             y (pd.Series): The target data.
         """
 
-    def _prepare_data(self, X, y):
-        """Transforms the input data to pandas data structure that our sampler can ingest.
-
-        Args:
-            X (pd.DataFrame): Training features.
-            y (pd.Series): Target.
-
-        Returns:
-            pd.DataFrame, pd.Series: Prepared X and y data as pandas types
-        """
-        X = infer_feature_types(X)
-        int_nullable_cols = X.ww.select(IntegerNullable).columns
-        if len(int_nullable_cols) > 0:
-            try:
-                X = X.astype(
-                    {
-                        null_col: int
-                        for null_col in X.ww.select(IntegerNullable).columns
-                    },
-                )
-            except ValueError:
-                X = X.astype(
-                    {
-                        null_col: float
-                        for null_col in X.ww.select(IntegerNullable).columns
-                    },
-                )
-            X.ww.init(schema=X.ww.schema)
-
-        if y is None:
-            raise ValueError("y cannot be None")
-        y = infer_feature_types(y)
-        return X, y
-
-    def transform(self, X, y=None):
+    def transform(self, X, y):
         """Transforms the input data by sampling the data.
 
         Args:
@@ -93,7 +58,8 @@ def transform(self, X, y=None):
         Returns:
             pd.DataFrame, pd.Series: Transformed features and target.
         """
-        X, y = self._prepare_data(X, y)
+        X = infer_feature_types(X)
+        y = infer_feature_types(y)
 
         categorical_columns = X.ww.select("Categorical", return_schema=True).columns
         for col in categorical_columns:
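The deleted `_prepare_data` try/except guarded against a specific failure mode that `_handle_nullable_types` now centralizes. A minimal reproduction of that failure, for context:

```python
# Int64 columns that actually contain pd.NA cannot be cast to plain int,
# which is why the old code fell back to float on ValueError.
import pandas as pd

s = pd.Series([1, None, 3], dtype="Int64")
try:
    s.astype("int64")  # raises: int64 cannot represent missing values
except (TypeError, ValueError) as err:
    print(f"falling back to float64 ({err})")
    print(s.astype("float64"))  # [1.0, NaN, 3.0]
```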
40 changes: 34 additions & 6 deletions evalml/pipelines/components/transformers/samplers/oversampler.py
@@ -72,18 +72,46 @@ def fit(self, X, y):
         Returns:
             self
         """
-        X_ww, y_ww = self._prepare_data(X, y)
+        X_ww = infer_feature_types(X)
+        if y is None:
+            raise ValueError("y cannot be None")
+        y_ww = infer_feature_types(y)
+
         sampler_name = self._get_best_oversampler(X_ww)
         self.sampler = self.sampler_options[sampler_name]
 
         # get categorical features first, if necessary
         if sampler_name == "SMOTENC":
-            self._get_categorical(X)
-        super().fit(X, y)
+            self._get_categorical(X_ww)
+        super().fit(X_ww, y_ww)
         return self
 
+    def transform(self, X, y=None):
+        """Transforms the input data by Oversampling the data.
+
+        Args:
+            X (pd.DataFrame): Training features.
+            y (pd.Series): Target.
+
+        Returns:
+            pd.DataFrame, pd.Series: Transformed features and target.
+        """
+        X_ww = infer_feature_types(X)
+        original_schema = X_ww.ww.schema
+        if y is None:
+            raise ValueError("y cannot be None")
+        y_ww = infer_feature_types(y)
+        X_d, y_d = self._handle_nullable_types(X_ww, y_ww)
+        X_t, y_t = super().transform(X_d, y_d)
+        X_t.ww.init(schema=original_schema)
+
+        return X_t, y_t
+
     def _get_best_oversampler(self, X):
-        cat_cols = X.ww.select(["category", "boolean"]).columns
+        cat_cols = X.ww.select(
+            ["category", "boolean", "BooleanNullable"],
+            return_schema=True,
+        ).columns
         if len(cat_cols) == X.shape[1]:
             return "SMOTEN"
         elif not len(cat_cols):
@@ -101,7 +129,7 @@ def _get_categorical(self, X):
         ]
         # Grab boolean columns, since SMOTE considers these categorical as well
         for i, val in enumerate(X.ww.types["Logical Type"].items()):
-            if str(val[1]) == "Boolean":
+            if str(val[1]) in {"Boolean", "BooleanNullable"}:
                 self.categorical_features.append(i)
         self._parameters["categorical_features"] = self.categorical_features

@@ -115,7 +143,7 @@ def _initialize_sampler(self, X, y):
             y (pd.Series): Target.
         """
         sampler_class = self.sampler
-        _, y_pd = self._prepare_data(X, y)
+        y_pd = infer_feature_types(y)
         sampler_params = {
             k: v
             for k, v in self.parameters.items()
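The new `Oversampler.transform` saves the woodwork schema, downcasts for the sampler, and restores the schema on the resampled output (the "check schema equality" commit above). A hedged sketch of that round trip; the slicing below is a toy stand-in for the sampler, not imblearn's API:

```python
import pandas as pd
import woodwork as ww

X = pd.DataFrame({"age": pd.array([25, 30, 35, 40], dtype="Int64")})
X.ww.init(logical_types={"age": "IntegerNullable"})
original_schema = X.ww.schema  # save before any downcasting

X_d = X.astype({"age": "float64"})  # stand-in for _handle_nullable_types
X_t = X_d.loc[[0, 1, 2, 3, 1]].reset_index(drop=True)  # stand-in for oversampling
X_t = X_t.astype({"age": "Int64"})  # dtype must match the saved schema again
X_t.ww.init(schema=original_schema)  # restore logical types, as the diff does
print(X_t.ww.schema.logical_types)
```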
@@ -95,7 +95,11 @@ def transform(self, X, y=None):
         Returns:
             pd.DataFrame, pd.Series: Transformed features and target.
         """
-        X_ww, y_ww = self._prepare_data(X, y)
+        X_ww = infer_feature_types(X)
+        if y is None:
+            raise ValueError("y cannot be None")
+        y_ww = infer_feature_types(y)
+
         index_df = pd.Series(y_ww.index)
         indices = self.fit_resample(X_ww, y_ww)

10 changes: 8 additions & 2 deletions evalml/tests/component_tests/test_arima_regressor.py
@@ -4,6 +4,7 @@
 import numpy as np
 import pandas as pd
 import pytest
+import woodwork as ww
 from sktime.forecasting.arima import AutoARIMA as SKArima
 from sktime.forecasting.base import ForecastingHorizon

@@ -476,11 +477,15 @@ def test_arima_regressor_can_forecast_arbitrary_dates(use_covariates, ts_data):
     )
 
 
-def test_arima_regressor_nullable_handling():
+@pytest.mark.parametrize(
+    "nullable_ltype",
+    ["IntegerNullable", "AgeNullable"],
+)
+def test_arima_regressor_with_nullable_types(nullable_ltype):
     X = pd.DataFrame()
     X["nums"] = pd.Series([i for i in range(100)], dtype="Int64")
     X.index = pd.date_range("1/1/21", periods=100)
-    X.ww.init(logical_types={"nums": "IntegerNullable"})
+    X.ww.init(logical_types={"nums": nullable_ltype})
 
     y = pd.Series([i for i in range(100)], dtype="Int64")
     y.index = pd.date_range("1/1/21", periods=100)
@@ -489,6 +494,7 @@ def test_arima_regressor_nullable_handling():
     X_test = X.ww.iloc[80:, :]
 
     y_train = y[:80]
+    y_train = ww.init_series(y_train, logical_type=nullable_ltype)
 
     arima_params = {
         "trend": None,
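For reference, constructing a nullable-typed target the way this parametrized test now does looks like the following (a usage sketch of the same `ww.init_series` call the diff adds):

```python
import pandas as pd
import woodwork as ww

y_train = pd.Series(range(80), dtype="Int64")
y_train = ww.init_series(y_train, logical_type="AgeNullable")
print(y_train.ww.logical_type)  # AgeNullable
```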