Skip to content

Commit

Permalink
Change treatment of generic column type object (#1415)
Browse files Browse the repository at this point in the history
* rename `auto-sklearn/autosklearn/pipeline/components/data_preprocessing/feature_reduction` to `auto-sklearn/autosklearn/pipeline/components/data_preprocessing/text_feature_reduction`.

Also rename the corresponding feature reduction class `FeatureReduction` to `TextFeatureReduction`:
`auto-sklearn/autosklearn/pipeline/components/data_preprocessing/text_feature_reduction/truncated_svd.py:TextFeatureReduction`

This includes adapting all `*.csv` and `*.json` files that participate in metalearning.

The "real" changes are limited to
  1. truncated_svd.py
  2. feature_type_text.py

* rename `auto-sklearn/autosklearn/pipeline/components/data_preprocessing/feature_reduction` to `auto-sklearn/autosklearn/pipeline/components/data_preprocessing/text_feature_reduction`.

Also rename the corresponding feature reduction class `FeatureReduction` to `TextFeatureReduction`:
`auto-sklearn/autosklearn/pipeline/components/data_preprocessing/text_feature_reduction/truncated_svd.py:TextFeatureReduction`

This includes adapting all `*.csv` and `*.json` files that participate in metalearning.

The "real" changes are limited to
  1. truncated_svd.py
  2. feature_type_text.py

* change treatment of generic column dtype `object` for pandas dataframes.
The `object` type will be treated as `string` in the future.

* change treatment of generic column dtype `object` for pandas dataframes.
The `object` type will be treated as `string` in the future.

* change treatment of generic column dtype `object` for pandas dataframes.
The `object` type will be treated as `string` in the future.

* change treatment of generic column dtype `object` for pandas dataframes.
The `object` type will be treated as `string` in the future.

add new test case to `test_feature_validator.py`

* change treatment of generic column dtype `object` for pandas dataframes.
The `object` type will be treated as `string` in the future.

add new test case to `test_feature_validator.py`

* change treatment of generic column dtype `object` for pandas dataframes.
The `object` type will be treated as `string` in the future.

add new test case to `test_feature_validator.py`
  • Loading branch information
Louquinze authored Mar 15, 2022
1 parent 457e50c commit d6b90f1
Show file tree
Hide file tree
Showing 2 changed files with 60 additions and 11 deletions.
16 changes: 7 additions & 9 deletions autosklearn/data/feature_validator.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from typing import Dict, List, Optional, Tuple, Union, cast

import logging
import warnings

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -304,16 +305,13 @@ def get_feat_type_from_columns(
# TypeError: data type not understood in certain pandas types
elif not is_numeric_dtype(X[column]):
if X[column].dtype.name == "object":
raise ValueError(
f"Input Column {column} has invalid type object. "
"Cast it to a valid dtype before using it in Auto-Sklearn. "
"Valid types are numerical, categorical or boolean. "
"You can cast it to a valid dtype using "
"pandas.Series.astype ."
"If working with string objects, the following "
"tutorial illustrates how to work with text data: "
"https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html" # noqa: E501
warnings.warn(
f"Input Column {column} has generic type object. "
f"Autosklearn will treat this column as string. "
f"Please ensure that this setting is suitable for your task.",
UserWarning,
)
feat_type[column] = "string"
elif pd.core.dtypes.common.is_datetime_or_timedelta_dtype(
X[column].dtype
):
Expand Down
55 changes: 53 additions & 2 deletions test/test_data/test_feature_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -383,7 +383,6 @@ def test_no_new_category_after_fit():
def test_featurevalidator_new_data_after_fit(
openml_id, train_data_type, test_data_type
):

# List is currently not supported as infer_objects
# cast list objects to type objects
if train_data_type == "list" or test_data_type == "list":
Expand Down Expand Up @@ -436,7 +435,6 @@ def test_featurevalidator_new_data_after_fit(
),
)
def test_list_to_dataframe(openml_id):

X_pandas, y_pandas = sklearn.datasets.fetch_openml(
data_id=openml_id, return_X_y=True, as_frame=True
)
Expand Down Expand Up @@ -509,3 +507,56 @@ def test_unsupported_dataframe_sparse():
ValueError, match=r"Auto-sklearn does not yet support sparse pandas"
):
validator.fit(df)


def test_object_columns():
    """Check that generic ``object`` columns are reported as ``"string"``.

    A dataframe is built whose object-dtype columns hold an arbitrary class
    instance, a list, a numpy array, a plain string, and a mix of all four,
    next to one genuinely categorical column. Calling
    ``get_feat_type_from_columns`` is expected to emit a ``UserWarning``
    (matched here against the ``dummy_object`` column) and to label every
    object column as ``"string"`` while keeping the categorical column
    ``"categorical"``.
    """

    class Dummy:
        # Arbitrary user-defined type; only its presence inside a column
        # matters, its methods are never invoked by this test.
        def __init__(self, x):
            self.x = x

        def __call__(self):
            print(self.x)

        def dummy_func(self):
            for _ in range(100):
                print("do something 100 times")

    obj_value = Dummy(1)
    list_value = [1, 2, 3]
    array_value = np.array([1, 2, 3])
    string_value = "dummy string"

    n_rows = 4
    columns = {
        "dummy_object": [obj_value] * n_rows,
        "dummy_lst": [list_value] * n_rows,
        "dummy_array": [array_value] * n_rows,
        "dummy_string": [string_value] * n_rows,
        "type_mix_column": [string_value, obj_value, array_value, list_value],
        "cat_column": ["a", "b", "a", "b"],
    }
    df = pd.DataFrame(columns)
    df["cat_column"] = df["cat_column"].astype("category")

    # Same pattern as before, just assembled once under a descriptive name.
    expected_warning = (
        r"Input Column dummy_object has generic type object. "
        r"Autosklearn will treat this column as string. "
        r"Please ensure that this setting is suitable for your task."
    )

    with pytest.warns(UserWarning, match=expected_warning):
        validator = FeatureValidator()
        feat_type = validator.get_feat_type_from_columns(df)

    # Every object-dtype column maps to "string"; the categorical one is kept.
    object_columns = (
        "dummy_object",
        "dummy_lst",
        "dummy_array",
        "dummy_string",
        "type_mix_column",
    )
    column_types = {name: "string" for name in object_columns}
    column_types["cat_column"] = "categorical"

    assert feat_type == column_types

0 comments on commit d6b90f1

Please sign in to comment.