Skip to content

Commit

Permalink
Text Processing (#1300)
Browse files Browse the repository at this point in the history
* commit meta learning data bases

* commit changed files

* commit new files

* fixed experimental settings

* implemented last comments on old PR

* adapted metalearning to last commit

* add a text preprocessing example

* integrated feedback

* new changes on *.csv files

* reset changes

* add changes for merging

* add changes for merging

* add changes for merging

* try to merge

* fixed string representation for metalearning (some sort of hot fix, maybe this needs to be fixed in a bigger scale)

* fixed string representation for metalearning (some sort of hot fix, maybe this needs to be fixed in a bigger scale)

* fixed string representation for metalearning (some sort of hot fix, maybe this needs to be fixed in a bigger scale)

* init

* init

* commit changes for text preprocessing

* text preprocessing commit

* fix metalearning

* fix metalearning

* adapted test to new text feature

* fix style guide issues

* integrate PR comments

* integrate PR comments

* implemented the comments to the last PR

* fitted operation is not in place; therefore we have to assign the fitted self.preprocessor back to itself

* add first text processing tests

* add first text processing tests

* including comments from 01.25.

* including comments from 01.28.

* including comments from 01.28.

* including comments from 01.28.

* including comments from 01.31.
  • Loading branch information
Louquinze authored and eddiebergman committed Aug 18, 2022
1 parent 43299a9 commit 1e17a07
Show file tree
Hide file tree
Showing 133 changed files with 22,962 additions and 18,279 deletions.
17 changes: 6 additions & 11 deletions autosklearn/data/feature_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,9 +124,9 @@ def fit(
))

for ft in self.feat_type.values():
if ft.lower() not in ['categorical', 'numerical']:
raise ValueError('Only `Categorical` and `Numerical` are '
'valid feature types, you passed `%s`' % ft)
if ft.lower() not in ['categorical', 'numerical', 'string']:
raise ValueError('Only `Categorical`, `Numerical` and `String` are '
'valid feature types')

if X_test is not None:
self._check_data(X_test)
Expand Down Expand Up @@ -262,7 +262,7 @@ def get_feat_type_from_columns(
) -> Dict[Union[str, int], str]:
"""
Returns a dictionary that maps pandas dataframe columns to a feature type.
This feature type can be categorical or numerical
This feature type can be categorical, numerical or string
Parameters
----------
Expand All @@ -284,8 +284,9 @@ def get_feat_type_from_columns(
raise ValueError("Auto-sklearn does not yet support sparse pandas Series."
f" Please convert {column} to a dense format.")
elif X[column].dtype.name in ['category', 'bool']:

feat_type[column] = 'categorical'
elif X[column].dtype.name == "string":
feat_type[column] = 'string'
# Move away from np.issubdtype as it causes
# TypeError: data type not understood in certain pandas types
elif not is_numeric_dtype(X[column]):
Expand Down Expand Up @@ -357,12 +358,6 @@ def list_to_dataframe(

# Store the dtypes and use in case of re-fit
if len(self.dtypes) == 0:
# Categorical data is inferred as string. Convert to categorical.
# Warn the user about dtypes or request him to use a dataframe
for col in X_train.columns:
if X_train[col].dtype.name == 'string':
X_train[col] = X_train[col].astype('category')

self.dtypes = {col: X_train[col].dtype.name.lower() for col in X_train.columns}
else:
for col in X_train.columns:
Expand Down

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

412 changes: 206 additions & 206 deletions autosklearn/metalearning/files/accuracy_binary.classification_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

396 changes: 198 additions & 198 deletions autosklearn/metalearning/files/accuracy_binary.classification_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

412 changes: 206 additions & 206 deletions autosklearn/metalearning/files/accuracy_multiclass.classification_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

396 changes: 198 additions & 198 deletions autosklearn/metalearning/files/accuracy_multiclass.classification_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

412 changes: 206 additions & 206 deletions ...learn/metalearning/files/average_precision_binary.classification_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

396 changes: 198 additions & 198 deletions ...earn/metalearning/files/average_precision_binary.classification_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

412 changes: 206 additions & 206 deletions ...n/metalearning/files/average_precision_multiclass.classification_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

396 changes: 198 additions & 198 deletions .../metalearning/files/average_precision_multiclass.classification_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

412 changes: 206 additions & 206 deletions ...learn/metalearning/files/balanced_accuracy_binary.classification_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

396 changes: 198 additions & 198 deletions ...earn/metalearning/files/balanced_accuracy_binary.classification_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

412 changes: 206 additions & 206 deletions ...n/metalearning/files/balanced_accuracy_multiclass.classification_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

396 changes: 198 additions & 198 deletions .../metalearning/files/balanced_accuracy_multiclass.classification_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

412 changes: 206 additions & 206 deletions autosklearn/metalearning/files/f1_binary.classification_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

396 changes: 198 additions & 198 deletions autosklearn/metalearning/files/f1_binary.classification_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

412 changes: 206 additions & 206 deletions autosklearn/metalearning/files/f1_macro_binary.classification_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

396 changes: 198 additions & 198 deletions autosklearn/metalearning/files/f1_macro_binary.classification_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

412 changes: 206 additions & 206 deletions autosklearn/metalearning/files/f1_macro_multiclass.classification_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

396 changes: 198 additions & 198 deletions autosklearn/metalearning/files/f1_macro_multiclass.classification_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

412 changes: 206 additions & 206 deletions autosklearn/metalearning/files/f1_micro_binary.classification_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

396 changes: 198 additions & 198 deletions autosklearn/metalearning/files/f1_micro_binary.classification_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

412 changes: 206 additions & 206 deletions autosklearn/metalearning/files/f1_micro_multiclass.classification_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

396 changes: 198 additions & 198 deletions autosklearn/metalearning/files/f1_micro_multiclass.classification_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

412 changes: 206 additions & 206 deletions autosklearn/metalearning/files/f1_multiclass.classification_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

396 changes: 198 additions & 198 deletions autosklearn/metalearning/files/f1_multiclass.classification_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

412 changes: 206 additions & 206 deletions autosklearn/metalearning/files/f1_samples_binary.classification_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

396 changes: 198 additions & 198 deletions autosklearn/metalearning/files/f1_samples_binary.classification_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

412 changes: 206 additions & 206 deletions autosklearn/metalearning/files/f1_samples_multiclass.classification_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

396 changes: 198 additions & 198 deletions ...sklearn/metalearning/files/f1_samples_multiclass.classification_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

412 changes: 206 additions & 206 deletions autosklearn/metalearning/files/f1_weighted_binary.classification_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

396 changes: 198 additions & 198 deletions autosklearn/metalearning/files/f1_weighted_binary.classification_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

412 changes: 206 additions & 206 deletions ...sklearn/metalearning/files/f1_weighted_multiclass.classification_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

396 changes: 198 additions & 198 deletions ...klearn/metalearning/files/f1_weighted_multiclass.classification_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

412 changes: 206 additions & 206 deletions autosklearn/metalearning/files/log_loss_binary.classification_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

396 changes: 198 additions & 198 deletions autosklearn/metalearning/files/log_loss_binary.classification_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

412 changes: 206 additions & 206 deletions autosklearn/metalearning/files/log_loss_multiclass.classification_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

396 changes: 198 additions & 198 deletions autosklearn/metalearning/files/log_loss_multiclass.classification_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

196 changes: 98 additions & 98 deletions autosklearn/metalearning/files/mean_absolute_error_regression_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

196 changes: 98 additions & 98 deletions autosklearn/metalearning/files/mean_absolute_error_regression_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

196 changes: 98 additions & 98 deletions autosklearn/metalearning/files/mean_squared_error_regression_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

196 changes: 98 additions & 98 deletions autosklearn/metalearning/files/mean_squared_error_regression_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

196 changes: 98 additions & 98 deletions autosklearn/metalearning/files/mean_squared_log_error_regression_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

196 changes: 98 additions & 98 deletions autosklearn/metalearning/files/mean_squared_log_error_regression_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

196 changes: 98 additions & 98 deletions autosklearn/metalearning/files/median_absolute_error_regression_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

196 changes: 98 additions & 98 deletions autosklearn/metalearning/files/median_absolute_error_regression_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

412 changes: 206 additions & 206 deletions autosklearn/metalearning/files/precision_binary.classification_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

396 changes: 198 additions & 198 deletions autosklearn/metalearning/files/precision_binary.classification_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

412 changes: 206 additions & 206 deletions ...sklearn/metalearning/files/precision_macro_binary.classification_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

396 changes: 198 additions & 198 deletions ...klearn/metalearning/files/precision_macro_binary.classification_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

412 changes: 206 additions & 206 deletions ...arn/metalearning/files/precision_macro_multiclass.classification_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

396 changes: 198 additions & 198 deletions ...rn/metalearning/files/precision_macro_multiclass.classification_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

412 changes: 206 additions & 206 deletions ...sklearn/metalearning/files/precision_micro_binary.classification_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

396 changes: 198 additions & 198 deletions ...klearn/metalearning/files/precision_micro_binary.classification_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

412 changes: 206 additions & 206 deletions ...arn/metalearning/files/precision_micro_multiclass.classification_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

396 changes: 198 additions & 198 deletions ...rn/metalearning/files/precision_micro_multiclass.classification_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

412 changes: 206 additions & 206 deletions autosklearn/metalearning/files/precision_multiclass.classification_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

396 changes: 198 additions & 198 deletions autosklearn/metalearning/files/precision_multiclass.classification_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

412 changes: 206 additions & 206 deletions ...learn/metalearning/files/precision_samples_binary.classification_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

396 changes: 198 additions & 198 deletions ...earn/metalearning/files/precision_samples_binary.classification_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

412 changes: 206 additions & 206 deletions ...n/metalearning/files/precision_samples_multiclass.classification_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

396 changes: 198 additions & 198 deletions .../metalearning/files/precision_samples_multiclass.classification_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

412 changes: 206 additions & 206 deletions ...earn/metalearning/files/precision_weighted_binary.classification_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

396 changes: 198 additions & 198 deletions ...arn/metalearning/files/precision_weighted_binary.classification_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

412 changes: 206 additions & 206 deletions .../metalearning/files/precision_weighted_multiclass.classification_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

396 changes: 198 additions & 198 deletions ...metalearning/files/precision_weighted_multiclass.classification_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

196 changes: 98 additions & 98 deletions autosklearn/metalearning/files/r2_regression_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

196 changes: 98 additions & 98 deletions autosklearn/metalearning/files/r2_regression_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

412 changes: 206 additions & 206 deletions autosklearn/metalearning/files/recall_binary.classification_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

396 changes: 198 additions & 198 deletions autosklearn/metalearning/files/recall_binary.classification_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

412 changes: 206 additions & 206 deletions autosklearn/metalearning/files/recall_macro_binary.classification_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

396 changes: 198 additions & 198 deletions autosklearn/metalearning/files/recall_macro_binary.classification_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

412 changes: 206 additions & 206 deletions ...klearn/metalearning/files/recall_macro_multiclass.classification_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

396 changes: 198 additions & 198 deletions ...learn/metalearning/files/recall_macro_multiclass.classification_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

412 changes: 206 additions & 206 deletions autosklearn/metalearning/files/recall_micro_binary.classification_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

396 changes: 198 additions & 198 deletions autosklearn/metalearning/files/recall_micro_binary.classification_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

412 changes: 206 additions & 206 deletions ...klearn/metalearning/files/recall_micro_multiclass.classification_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

396 changes: 198 additions & 198 deletions ...learn/metalearning/files/recall_micro_multiclass.classification_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

412 changes: 206 additions & 206 deletions autosklearn/metalearning/files/recall_multiclass.classification_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

396 changes: 198 additions & 198 deletions autosklearn/metalearning/files/recall_multiclass.classification_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

412 changes: 206 additions & 206 deletions autosklearn/metalearning/files/recall_samples_binary.classification_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

396 changes: 198 additions & 198 deletions ...sklearn/metalearning/files/recall_samples_binary.classification_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

412 changes: 206 additions & 206 deletions ...earn/metalearning/files/recall_samples_multiclass.classification_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

396 changes: 198 additions & 198 deletions ...arn/metalearning/files/recall_samples_multiclass.classification_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

412 changes: 206 additions & 206 deletions ...sklearn/metalearning/files/recall_weighted_binary.classification_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

396 changes: 198 additions & 198 deletions ...klearn/metalearning/files/recall_weighted_binary.classification_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

412 changes: 206 additions & 206 deletions ...arn/metalearning/files/recall_weighted_multiclass.classification_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

396 changes: 198 additions & 198 deletions ...rn/metalearning/files/recall_weighted_multiclass.classification_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

412 changes: 206 additions & 206 deletions autosklearn/metalearning/files/roc_auc_binary.classification_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

396 changes: 198 additions & 198 deletions autosklearn/metalearning/files/roc_auc_binary.classification_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

412 changes: 206 additions & 206 deletions autosklearn/metalearning/files/roc_auc_multiclass.classification_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

396 changes: 198 additions & 198 deletions autosklearn/metalearning/files/roc_auc_multiclass.classification_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

196 changes: 98 additions & 98 deletions autosklearn/metalearning/files/root_mean_squared_error_regression_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

196 changes: 98 additions & 98 deletions autosklearn/metalearning/files/root_mean_squared_error_regression_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

12 changes: 10 additions & 2 deletions autosklearn/metalearning/metafeatures/metafeatures.py
Original file line number Diff line number Diff line change
Expand Up @@ -1082,11 +1082,19 @@ def calculate_all_metafeatures(X, y, categorical, dataset_name, logger,
# TODO make sure this is done as efficient as possible (no copy for
# sparse matrices because of wrong sparse format)
sparse = scipy.sparse.issparse(X)

feat_type = {key: 'categorical' if value else 'numerical'
for key, value in categorical.items()}

# TODO make this more cohesive to the overall structure (quick bug fix)
if isinstance(X, pd.DataFrame):
for key in X.select_dtypes(include="string").columns:
feat_type[key] = "string"

DPP = FeatTypeSplit(
# The difference between feat_type and categorical, is that
# categorical has True/False instead of categorical/numerical
feat_type={key: 'categorical' if value else 'numerical'
for key, value in categorical.items()},
feat_type=feat_type,
force_sparse_output=True)
X_transformed = DPP.fit_transform(X)
categorical_transformed = {i: False for i in range(X_transformed.shape[1])}
Expand Down
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
from typing import Dict, Optional, Tuple, Union

from ConfigSpace.configuration_space import ConfigurationSpace
import ConfigSpace.hyperparameters as CSH

import numpy as np

from autosklearn.pipeline.base import DATASET_PROPERTIES_TYPE, PIPELINE_DATA_DTYPE
from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm
from autosklearn.pipeline.constants import DENSE, SPARSE, UNSIGNED_DATA, INPUT

from sklearn.decomposition import TruncatedSVD


class FeatureReduction(AutoSklearnPreprocessingAlgorithm):
    """
    Reduces the features created by a bag of words encoding.

    Applies :class:`sklearn.decomposition.TruncatedSVD` to compress the
    (typically sparse, high-dimensional) output of a text vectorizer down
    to at most ``n_components`` dimensions.
    """

    def __init__(
        self,
        n_components: Optional[int] = None,
        random_state: Optional[Union[int, np.random.RandomState]] = None
    ) -> None:
        # Target dimensionality; normally set through the hyperparameter
        # search space (see get_hyperparameter_search_space below).
        self.n_components = n_components
        self.random_state = random_state
        # Set by fit(); transform() refuses to run until then. Initializing
        # here avoids an AttributeError when transform() is called unfitted.
        self.preprocessor: Optional[TruncatedSVD] = None

    def fit(self, X: PIPELINE_DATA_DTYPE, y: Optional[PIPELINE_DATA_DTYPE] = None
            ) -> 'FeatureReduction':
        """Fit a TruncatedSVD on ``X``.

        TruncatedSVD requires ``n_components < n_features``, so the effective
        number of components is capped at ``X.shape[1] - 1``.

        Raises
        ------
        ValueError
            If ``n_components`` was never set, or if ``X`` has only a single
            feature (SVD cannot reduce it any further).
        """
        if self.n_components is None:
            # The original comparison below would raise an opaque TypeError
            # (int vs. None); fail with a clear message instead.
            raise ValueError("n_components must be set before calling fit().")
        if X.shape[1] == 1:
            raise ValueError("The text embedding consists only of a single dimension.\n"
                             "Are you sure that your text data is necessary?")
        # Equivalent to the original three-way branch: use n_components when
        # there are strictly more features than that, otherwise n_features - 1.
        self.preprocessor = TruncatedSVD(
            n_components=min(self.n_components, X.shape[1] - 1),
            random_state=self.random_state)
        self.preprocessor.fit(X)
        return self

    def transform(self, X: PIPELINE_DATA_DTYPE) -> PIPELINE_DATA_DTYPE:
        """Project ``X`` onto the fitted SVD components."""
        if self.preprocessor is None:
            # Kept as NotImplementedError for backward compatibility with
            # callers that catch it; signals transform-before-fit.
            raise NotImplementedError()
        return self.preprocessor.transform(X)

    @staticmethod
    def get_properties(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None
                       ) -> Dict[str, Optional[Union[str, int, bool, Tuple]]]:
        """Describe the capabilities of this component for pipeline matching."""
        return {'shortname': 'TextFeatureReduction',
                'name': 'TextFeatureReduction',
                'handles_missing_values': True,
                'handles_nominal_values': True,
                'handles_numerical_features': True,
                'prefers_data_scaled': False,
                'prefers_data_normalized': False,
                'handles_regression': True,
                'handles_classification': True,
                'handles_multiclass': True,
                'handles_multilabel': True,
                'handles_multioutput': True,
                'is_deterministic': True,
                'handles_sparse': True,
                'handles_dense': True,
                'input': (DENSE, SPARSE, UNSIGNED_DATA),
                'output': (INPUT,),
                'preferred_dtype': None}

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None
                                        ) -> ConfigurationSpace:
        """Expose ``n_components`` as a log-scaled integer hyperparameter."""
        cs = ConfigurationSpace()
        cs.add_hyperparameter(
            CSH.UniformIntegerHyperparameter("n_components", lower=1, upper=10000,
                                             default_value=100, log=True))
        return cs
57 changes: 34 additions & 23 deletions autosklearn/pipeline/components/data_preprocessing/feature_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@
import CategoricalPreprocessingPipeline
from autosklearn.pipeline.components.data_preprocessing.feature_type_numerical \
import NumericalPreprocessingPipeline
from autosklearn.pipeline.components.data_preprocessing.feature_type_text \
import TextPreprocessingPipeline
from autosklearn.pipeline.components.base import AutoSklearnComponent, AutoSklearnChoice, \
AutoSklearnPreprocessingAlgorithm
from autosklearn.pipeline.constants import DENSE, SPARSE, UNSIGNED_DATA, INPUT
Expand All @@ -29,8 +31,8 @@


class FeatTypeSplit(AutoSklearnPreprocessingAlgorithm):
""" This component is used to apply distinct transformations to categorical and
numerical features of a dataset. It is built on top of sklearn's ColumnTransformer.
""" This component is used to apply distinct transformations to categorical,
numerical and text features of a dataset. It is built on top of sklearn's ColumnTransformer.
"""

def __init__(
Expand Down Expand Up @@ -82,9 +84,23 @@ def __init__(
config=None, steps=pipeline, dataset_properties=dataset_properties,
include=include, exclude=exclude, random_state=random_state,
init_params=init_params)

# The pipeline that will be applied to the text features (i.e. columns)
# of the dataset
# Configuration of the data-preprocessor is different from the configuration of
# the numerical or categorical pipeline. Hence, force to None
# It is actually the call to set_hyperparameter who properly sets this argument
# TODO: Extract the child configuration space from the FeatTypeSplit to the
# pipeline if needed
self.txt_ppl = TextPreprocessingPipeline(
config=None, steps=pipeline, dataset_properties=dataset_properties,
include=include, exclude=exclude, random_state=random_state,
init_params=init_params)

self._transformers: List[Tuple[str, AutoSklearnComponent]] = [
("categorical_transformer", self.categ_ppl),
("numerical_transformer", self.numer_ppl),
("text_transformer", self.txt_ppl),
]
if self.config:
self.set_hyperparameters(self.config, init_params=init_params)
Expand All @@ -96,6 +112,7 @@ def fit(self, X: SUPPORTED_FEAT_TYPES, y: Optional[SUPPORTED_TARGET_TYPES] = Non
n_feats = X.shape[1]
categorical_features = []
numerical_features = []
text_features = []
if self.feat_type is not None:
# Make sure that we are not missing any column!
expected = set(self.feat_type.keys())
Expand All @@ -104,42 +121,36 @@ def fit(self, X: SUPPORTED_FEAT_TYPES, y: Optional[SUPPORTED_TARGET_TYPES] = Non
else:
columns = set(range(n_feats))
if expected != columns:
raise ValueError("Train data has columns={} yet the feat_types are feat={}".format(
expected,
columns
))
raise ValueError(f"Train data has columns={expected} yet the"
f" feat_types are feat={columns}")
categorical_features = [key for key, value in self.feat_type.items()
if value.lower() == 'categorical']
numerical_features = [key for key, value in self.feat_type.items()
if value.lower() == 'numerical']
text_features = [key for key, value in self.feat_type.items()
if value.lower() == "string"]

# If no categorical features, assume we have a numerical only pipeline
if len(categorical_features) == 0:
sklearn_transf_spec: List[Tuple[str, BaseEstimator, List[Union[str, bool, int]]]] = [
("numerical_transformer", self.numer_ppl, [True] * n_feats)
]
# If all features are categorical, then just the categorical transformer is used
elif len(numerical_features) == 0:
sklearn_transf_spec = [
("categorical_transformer", self.categ_ppl, [True] * n_feats)
(name, transformer, feature_columns)
for name, transformer, feature_columns
in [
("text_transformer", self.txt_ppl, text_features),
("categorical_transformer", self.categ_ppl, categorical_features),
("numerical_transformer", self.numer_ppl, numerical_features)
]
if len(feature_columns) > 0
]
# For the other cases, both transformers are used
else:
sklearn_transf_spec = [
("categorical_transformer", self.categ_ppl, categorical_features),
("numerical_transformer", self.numer_ppl, numerical_features)
]
# self.feature_type == None assumes numerical case
sklearn_transf_spec = [("numerical_transformer", self.numer_ppl, [True]*n_feats)]

# And one last check in case feat type is None
# And to make sure the final specification has all the columns
# considered in the column transformer
total_columns = sum([len(features) for name, ppl, features in sklearn_transf_spec])
if total_columns != n_feats:
raise ValueError("Missing columns in the specification of the data validator"
" for train data={} and spec={}".format(
np.shape(X),
sklearn_transf_spec,
))
f" for train data={np.shape(X)} and spec={sklearn_transf_spec}")

self.sparse_ = sparse.issparse(X) or self.force_sparse_output
self.column_transformer = sklearn.compose.ColumnTransformer(
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
from typing import Any, List, Dict, Optional, Tuple, Union

from ConfigSpace.configuration_space import Configuration, ConfigurationSpace

import numpy as np

from sklearn.base import BaseEstimator

from autosklearn.pipeline.components.data_preprocessing.text_encoding \
import BagOfWordChoice
from autosklearn.pipeline.components.data_preprocessing.feature_reduction.truncated_svd import \
FeatureReduction
from autosklearn.pipeline.base import (
BasePipeline,
DATASET_PROPERTIES_TYPE,
)
from autosklearn.pipeline.constants import DENSE, SPARSE, UNSIGNED_DATA, INPUT


class TextPreprocessingPipeline(BasePipeline):
    """Pipeline for preprocessing text-only feature columns.

    Assumes every column handed to it is a text feature and applies,
    in order:

    1. ``"text_encoding"`` — a bag-of-words vectorizer (``BagOfWordChoice``)
       that fits and applies a vectorizer object.
    2. ``"feature_reduction"`` — a TruncatedSVD-based ``FeatureReduction``
       step that compresses the resulting encoding.

    Parameters
    ----------
    config : ConfigSpace.configuration_space.Configuration
        The configuration to evaluate.
    random_state : Optional[int | RandomState]
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance
        used by `np.random`."""

    def __init__(
        self,
        config: Optional[Configuration] = None,
        steps: Optional[List[Tuple[str, BaseEstimator]]] = None,
        dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None,
        include: Optional[Dict[str, str]] = None,
        exclude: Optional[Dict[str, str]] = None,
        random_state: Optional[Union[int, np.random.RandomState]] = None,
        init_params: Optional[Dict[str, Any]] = None
    ) -> None:
        # Must be assigned before the base constructor runs, in case the
        # base pipeline consults the output dtype while assembling steps.
        self._output_dtype = np.int32
        super().__init__(
            config, steps, dataset_properties, include, exclude,
            random_state, init_params
        )

    @staticmethod
    def get_properties(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None
                       ) -> Dict[str, Optional[Union[str, int, bool, Tuple]]]:
        """Describe the capabilities of this pipeline for component matching."""
        return {'shortname': 'txt_datapreproc',
                'name': 'text data preprocessing',
                'handles_missing_values': True,
                'handles_nominal_values': False,
                'handles_numerical_features': False,
                'prefers_data_scaled': False,
                'prefers_data_normalized': False,
                'handles_regression': True,
                'handles_classification': True,
                'handles_multiclass': True,
                'handles_multilabel': True,
                'is_deterministic': True,
                'handles_sparse': True,
                'handles_dense': True,
                'input': (DENSE, SPARSE, UNSIGNED_DATA),
                'output': (INPUT,),
                'preferred_dtype': None}

    def _get_hyperparameter_search_space(
        self,
        include: Optional[Dict[str, str]] = None,
        exclude: Optional[Dict[str, str]] = None,
        dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None,
    ) -> ConfigurationSpace:
        """Assemble the hyperparameter configuration space.

        Delegates to ``_get_base_search_space`` so each pipeline step
        contributes its own hyperparameters.

        Returns
        -------
        cs : ConfigSpace.configuration_space.ConfigurationSpace
            The combined search space of all steps.
        """
        # Treat anything that is not a plain dict (including None) as empty.
        if not isinstance(dataset_properties, dict):
            dataset_properties = dict()

        return self._get_base_search_space(
            cs=ConfigurationSpace(), dataset_properties=dataset_properties,
            exclude=exclude, include=include, pipeline=self.steps)

    def _get_pipeline_steps(self,
                            dataset_properties: Optional[Dict[str, str]] = None,
                            ) -> List[Tuple[str, BaseEstimator]]:
        """Return the ordered (name, component) pairs of this pipeline."""
        props: Dict[str, str] = {}
        if isinstance(dataset_properties, dict):
            props.update(dataset_properties)

        return [
            ("text_encoding",
             BagOfWordChoice(props, random_state=self.random_state)),
            ("feature_reduction",
             FeatureReduction(random_state=self.random_state)),
        ]

    def _get_estimator_hyperparameter_name(self) -> str:
        """Human-readable identifier used by the base pipeline."""
        return "text data preprocessing"
Loading

0 comments on commit 1e17a07

Please sign in to comment.