Change treatment of generic column type object (#1415)

* Rename `auto-sklearn/autosklearn/pipeline/components/data_preprocessing/feature_reduction` to `auto-sklearn/autosklearn/pipeline/components/data_preprocessing/text_feature_reduction`. Also rename the corresponding feature reduction class `FeatureReduction` to `TextFeatureReduction` (`auto-sklearn/autosklearn/pipeline/components/data_preprocessing/text_feature_reduction/truncated_svd.py:TextFeatureReduction`). This includes adapting all `*.csv` and `*.json` files participating in metalearning. The "real" changes are limited to: 1. `truncated_svd.py`, 2. `feature_type_text.py`.
* Rename `auto-sklearn/autosklearn/pipeline/components/data_preprocessing/feature_reduction` to `auto-sklearn/autosklearn/pipeline/components/data_preprocessing/text_feature_reduction`. Also rename the corresponding feature reduction class `FeatureReduction` to `TextFeatureReduction` (`auto-sklearn/autosklearn/pipeline/components/data_preprocessing/text_feature_reduction/truncated_svd.py:TextFeatureReduction`). This includes adapting all `*.csv` and `*.json` files participating in metalearning. The "real" changes are limited to: 1. `truncated_svd.py`, 2. `feature_type_text.py`.
* Change treatment of the generic column dtype `object` for pandas DataFrames. The `object` type will be treated as `string` in the future.
* Change treatment of the generic column dtype `object` for pandas DataFrames. The `object` type will be treated as `string` in the future.
* Change treatment of the generic column dtype `object` for pandas DataFrames. The `object` type will be treated as `string` in the future.
* Change treatment of the generic column dtype `object` for pandas DataFrames. The `object` type will be treated as `string` in the future. Add a new test case to `test_feature_validator.py`.
* Change treatment of the generic column dtype `object` for pandas DataFrames. The `object` type will be treated as `string` in the future. Add a new test case to `test_feature_validator.py`.
* Change treatment of the generic column dtype `object` for pandas DataFrames. The `object` type will be treated as `string` in the future. Add a new test case to `test_feature_validator.py`.
automl · Mar 15, 2022 · d6b90f1 · d6b90f1
1 parent 457e50c
commit d6b90f1
Show file tree

Hide file tree

Showing 2 changed files with 60 additions and 11 deletions.
diff --git a/autosklearn/data/feature_validator.py b/autosklearn/data/feature_validator.py
@@ -1,6 +1,7 @@
 from typing import Dict, List, Optional, Tuple, Union, cast
 
 import logging
+import warnings
 
 import numpy as np
 import pandas as pd
@@ -304,16 +305,13 @@ def get_feat_type_from_columns(
             # TypeError: data type not understood in certain pandas types
             elif not is_numeric_dtype(X[column]):
                 if X[column].dtype.name == "object":
-                    raise ValueError(
-                        f"Input Column {column} has invalid type object. "
-                        "Cast it to a valid dtype before using it in Auto-Sklearn. "
-                        "Valid types are numerical, categorical or boolean. "
-                        "You can cast it to a valid dtype using "
-                        "pandas.Series.astype ."
-                        "If working with string objects, the following "
-                        "tutorial illustrates how to work with text data: "
-                        "https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html"  # noqa: E501
+                    warnings.warn(
+                        f"Input Column {column} has generic type object. "
+                        f"Autosklearn will treat this column as string. "
+                        f"Please ensure that this setting is suitable for your task.",
+                        UserWarning,
                     )
+                    feat_type[column] = "string"
                 elif pd.core.dtypes.common.is_datetime_or_timedelta_dtype(
                     X[column].dtype
                 ):

diff --git a/test/test_data/test_feature_validator.py b/test/test_data/test_feature_validator.py
@@ -383,7 +383,6 @@ def test_no_new_category_after_fit():
 def test_featurevalidator_new_data_after_fit(
     openml_id, train_data_type, test_data_type
 ):
-
     # List is currently not supported as infer_objects
     # cast list objects to type objects
     if train_data_type == "list" or test_data_type == "list":
@@ -436,7 +435,6 @@ def test_featurevalidator_new_data_after_fit(
     ),
 )
 def test_list_to_dataframe(openml_id):
-
     X_pandas, y_pandas = sklearn.datasets.fetch_openml(
         data_id=openml_id, return_X_y=True, as_frame=True
     )
@@ -509,3 +507,56 @@ def test_unsupported_dataframe_sparse():
         ValueError, match=r"Auto-sklearn does not yet support sparse pandas"
     ):
         validator.fit(df)
+
+
+def test_object_columns():
+    class Dummy:
+        def __init__(self, x):
+            self.x = x
+
+        def __call__(self):
+            print(self.x)
+
+        def dummy_func(self):
+            for i in range(100):
+                print("do something 100 times")
+
+    dummy_object = Dummy(1)
+    lst = [1, 2, 3]
+    array = np.array([1, 2, 3])
+    dummy_stirng = "dummy string"
+
+    df = pd.DataFrame(
+        {
+            "dummy_object": [dummy_object] * 4,
+            "dummy_lst": [lst] * 4,
+            "dummy_array": [array] * 4,
+            "dummy_string": [dummy_stirng] * 4,
+            "type_mix_column": [dummy_stirng, dummy_object, array, lst],
+            "cat_column": ["a", "b", "a", "b"],
+        }
+    )
+    df["cat_column"] = df["cat_column"].astype("category")
+
+    with pytest.warns(
+        UserWarning,
+        match=r"Input Column dummy_object has "
+        r"generic type object. "
+        r"Autosklearn will treat "
+        r"this column as string. "
+        r"Please ensure that this setting "
+        r"is suitable for your task.",
+    ):
+        validator = FeatureValidator()
+        feat_type = validator.get_feat_type_from_columns(df)
+
+    column_types = {
+        "dummy_object": "string",
+        "dummy_lst": "string",
+        "dummy_array": "string",
+        "dummy_string": "string",
+        "type_mix_column": "string",
+        "cat_column": "categorical",
+    }
+
+    assert feat_type == column_types