Text Processing #1300

Merged
merged 40 commits on Feb 3, 2022
Changes from 39 commits
Commits (40)
4450d86
commit meta learning data bases
Louquinze Nov 9, 2021
e821eaf
commit changed files
Louquinze Nov 9, 2021
ae4f59f
commit new files
Louquinze Nov 9, 2021
d0a10ab
fixed experimental settings
Louquinze Nov 12, 2021
65271a9
Merge branch 'automl:development' into development
Louquinze Nov 12, 2021
55e87e2
implemented last comments on old PR
Louquinze Nov 17, 2021
590387d
adapted metalearning to last commit
Louquinze Nov 17, 2021
2809c46
add a text preprocessing example
Louquinze Nov 17, 2021
ffe8ccf
intigrated feedback
Louquinze Nov 17, 2021
8094eb5
new changes on *.csv files
Louquinze Nov 17, 2021
1a27144
reset changes
Louquinze Nov 17, 2021
63e6fdb
Merge remote-tracking branch 'origin/development' into development
Louquinze Nov 17, 2021
1a2f66d
add changes for merging
Louquinze Nov 17, 2021
107e854
add changes for merging
Louquinze Nov 17, 2021
88aa101
add changes for merging
Louquinze Nov 17, 2021
11f092f
try to merge
Louquinze Nov 17, 2021
d5a03d6
fixed string representation for metalearning (some sort of hot fix, m…
Louquinze Dec 7, 2021
220807e
fixed string representation for metalearning (some sort of hot fix, m…
Louquinze Dec 7, 2021
38ffd06
fixed string representation for metalearning (some sort of hot fix, m…
Louquinze Dec 7, 2021
fa7c8e7
init
Louquinze Jan 13, 2022
2e1947a
init
Louquinze Jan 13, 2022
b56f05f
commit changes for text preprocessing
Louquinze Jan 13, 2022
0d95435
text prepreprocessing commit
Louquinze Jan 13, 2022
3a00674
fix metalearning
Louquinze Jan 13, 2022
cabdb66
fix metalearning
Louquinze Jan 13, 2022
fdd7007
resolve conflicts
Louquinze Jan 14, 2022
8fe74a4
Merge branch 'automl-development' into development
Louquinze Jan 14, 2022
20caf09
adapted test to new text feature
Louquinze Jan 14, 2022
42a7bdb
fix style guide issues
Louquinze Jan 14, 2022
b7bc8fb
integrate PR comments
Louquinze Jan 19, 2022
e85eb2e
integrate PR comments
Louquinze Jan 19, 2022
cafb1d4
implemented the comments to the last PR
Louquinze Jan 23, 2022
b9da42d
fitted operation is not in place therefore we have to assgin the fitt…
Louquinze Jan 23, 2022
d2d5a24
add first text processing tests
Louquinze Jan 23, 2022
ac40ff9
add first text processing tests
Louquinze Jan 24, 2022
38be7c3
including comments from 01.25.
Louquinze Jan 25, 2022
5f6d6a7
including comments from 01.28.
Louquinze Jan 28, 2022
94b9c27
including comments from 01.28.
Louquinze Jan 28, 2022
bc6e883
including comments from 01.28.
Louquinze Jan 28, 2022
ce1c0d1
including comments from 01.31.
Louquinze Jan 31, 2022
17 changes: 6 additions & 11 deletions autosklearn/data/feature_validator.py
@@ -129,9 +129,9 @@ def fit(
))

for ft in self.feat_type.values():
if ft.lower() not in ['categorical', 'numerical']:
raise ValueError('Only `Categorical` and `Numerical` are '
'valid feature types, you passed `%s`' % ft)
if ft.lower() not in ['categorical', 'numerical', 'string']:
raise ValueError('Only `Categorical`, `Numerical` and `String` are '
'valid feature types')

if X_test is not None:
self._check_data(X_test)
@@ -264,7 +264,7 @@ def get_feat_type_from_columns(
) -> typing.Dict[typing.Union[str, int], str]:
"""
Returns a dictionary that maps pandas dataframe columns to a feature type.
This feature type can be categorical or numerical
This feature type can be categorical, numerical or string

Parameters
----------
@@ -286,8 +286,9 @@ def get_feat_type_from_columns(
raise ValueError("Auto-sklearn does not yet support sparse pandas Series."
f" Please convert {column} to a dense format.")
elif X[column].dtype.name in ['category', 'bool']:

feat_type[column] = 'categorical'
elif X[column].dtype.name == "string":
feat_type[column] = 'string'
# Move away from np.issubdtype as it causes
# TypeError: data type not understood in certain pandas types
elif not is_numeric_dtype(X[column]):
@@ -359,12 +360,6 @@ def list_to_dataframe(

# Store the dtypes and use in case of re-fit
if len(self.dtypes) == 0:
# Categorical data is inferred as string. Convert to categorical.
# Warn the user about dtypes or request him to use a dataframe
for col in X_train.columns:
if X_train[col].dtype.name == 'string':
X_train[col] = X_train[col].astype('category')

self.dtypes = {col: X_train[col].dtype.name.lower() for col in X_train.columns}
else:
for col in X_train.columns:
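
For illustration (this snippet is not part of the diff): a minimal sketch of how the new `string` feature type surfaces to a user of the feature validator. The DataFrame and column names are invented; the dtype-based inference follows `get_feat_type_from_columns` above.

```python
# Hypothetical example: columns with the pandas "string" dtype are now mapped to
# the 'string' feature type instead of being coerced to categorical.
import pandas as pd

X_train = pd.DataFrame({
    "age": [23, 31, 54],                                            # numerical
    "colour": pd.Series(["red", "blue", "red"], dtype="category"),  # categorical
    "review": pd.Series(["great product", "broke after a day", "ok"],
                        dtype="string"),                            # string/text
})

# Equivalent explicit declaration, matching the validation in fit() above:
feat_type = {"age": "numerical", "colour": "categorical", "review": "string"}
```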

412 changes: 206 additions & 206 deletions autosklearn/metalearning/files/accuracy_binary.classification_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

396 changes: 198 additions & 198 deletions autosklearn/metalearning/files/accuracy_binary.classification_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

412 changes: 206 additions & 206 deletions autosklearn/metalearning/files/accuracy_multiclass.classification_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

396 changes: 198 additions & 198 deletions autosklearn/metalearning/files/accuracy_multiclass.classification_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

412 changes: 206 additions & 206 deletions ...learn/metalearning/files/average_precision_binary.classification_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

396 changes: 198 additions & 198 deletions ...earn/metalearning/files/average_precision_binary.classification_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

412 changes: 206 additions & 206 deletions ...n/metalearning/files/average_precision_multiclass.classification_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

396 changes: 198 additions & 198 deletions .../metalearning/files/average_precision_multiclass.classification_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

412 changes: 206 additions & 206 deletions ...learn/metalearning/files/balanced_accuracy_binary.classification_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

396 changes: 198 additions & 198 deletions ...earn/metalearning/files/balanced_accuracy_binary.classification_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

412 changes: 206 additions & 206 deletions ...n/metalearning/files/balanced_accuracy_multiclass.classification_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

396 changes: 198 additions & 198 deletions .../metalearning/files/balanced_accuracy_multiclass.classification_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

412 changes: 206 additions & 206 deletions autosklearn/metalearning/files/f1_binary.classification_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

396 changes: 198 additions & 198 deletions autosklearn/metalearning/files/f1_binary.classification_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

412 changes: 206 additions & 206 deletions autosklearn/metalearning/files/f1_macro_binary.classification_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

396 changes: 198 additions & 198 deletions autosklearn/metalearning/files/f1_macro_binary.classification_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

412 changes: 206 additions & 206 deletions autosklearn/metalearning/files/f1_macro_multiclass.classification_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

396 changes: 198 additions & 198 deletions autosklearn/metalearning/files/f1_macro_multiclass.classification_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

412 changes: 206 additions & 206 deletions autosklearn/metalearning/files/f1_micro_binary.classification_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

396 changes: 198 additions & 198 deletions autosklearn/metalearning/files/f1_micro_binary.classification_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

412 changes: 206 additions & 206 deletions autosklearn/metalearning/files/f1_micro_multiclass.classification_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

396 changes: 198 additions & 198 deletions autosklearn/metalearning/files/f1_micro_multiclass.classification_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

412 changes: 206 additions & 206 deletions autosklearn/metalearning/files/f1_multiclass.classification_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

396 changes: 198 additions & 198 deletions autosklearn/metalearning/files/f1_multiclass.classification_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

412 changes: 206 additions & 206 deletions autosklearn/metalearning/files/f1_samples_binary.classification_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

396 changes: 198 additions & 198 deletions autosklearn/metalearning/files/f1_samples_binary.classification_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

412 changes: 206 additions & 206 deletions autosklearn/metalearning/files/f1_samples_multiclass.classification_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

396 changes: 198 additions & 198 deletions ...sklearn/metalearning/files/f1_samples_multiclass.classification_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

412 changes: 206 additions & 206 deletions autosklearn/metalearning/files/f1_weighted_binary.classification_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

396 changes: 198 additions & 198 deletions autosklearn/metalearning/files/f1_weighted_binary.classification_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

412 changes: 206 additions & 206 deletions ...sklearn/metalearning/files/f1_weighted_multiclass.classification_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

396 changes: 198 additions & 198 deletions ...klearn/metalearning/files/f1_weighted_multiclass.classification_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

412 changes: 206 additions & 206 deletions autosklearn/metalearning/files/log_loss_binary.classification_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

396 changes: 198 additions & 198 deletions autosklearn/metalearning/files/log_loss_binary.classification_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

412 changes: 206 additions & 206 deletions autosklearn/metalearning/files/log_loss_multiclass.classification_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

396 changes: 198 additions & 198 deletions autosklearn/metalearning/files/log_loss_multiclass.classification_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

196 changes: 98 additions & 98 deletions autosklearn/metalearning/files/mean_absolute_error_regression_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

196 changes: 98 additions & 98 deletions autosklearn/metalearning/files/mean_absolute_error_regression_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

196 changes: 98 additions & 98 deletions autosklearn/metalearning/files/mean_squared_error_regression_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

196 changes: 98 additions & 98 deletions autosklearn/metalearning/files/mean_squared_error_regression_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

196 changes: 98 additions & 98 deletions autosklearn/metalearning/files/mean_squared_log_error_regression_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

196 changes: 98 additions & 98 deletions autosklearn/metalearning/files/mean_squared_log_error_regression_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

196 changes: 98 additions & 98 deletions autosklearn/metalearning/files/median_absolute_error_regression_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

196 changes: 98 additions & 98 deletions autosklearn/metalearning/files/median_absolute_error_regression_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

412 changes: 206 additions & 206 deletions autosklearn/metalearning/files/precision_binary.classification_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

396 changes: 198 additions & 198 deletions autosklearn/metalearning/files/precision_binary.classification_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

412 changes: 206 additions & 206 deletions ...sklearn/metalearning/files/precision_macro_binary.classification_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

396 changes: 198 additions & 198 deletions ...klearn/metalearning/files/precision_macro_binary.classification_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

412 changes: 206 additions & 206 deletions ...arn/metalearning/files/precision_macro_multiclass.classification_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

396 changes: 198 additions & 198 deletions ...rn/metalearning/files/precision_macro_multiclass.classification_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

412 changes: 206 additions & 206 deletions ...sklearn/metalearning/files/precision_micro_binary.classification_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

396 changes: 198 additions & 198 deletions ...klearn/metalearning/files/precision_micro_binary.classification_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

412 changes: 206 additions & 206 deletions ...arn/metalearning/files/precision_micro_multiclass.classification_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

396 changes: 198 additions & 198 deletions ...rn/metalearning/files/precision_micro_multiclass.classification_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

412 changes: 206 additions & 206 deletions autosklearn/metalearning/files/precision_multiclass.classification_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

396 changes: 198 additions & 198 deletions autosklearn/metalearning/files/precision_multiclass.classification_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

412 changes: 206 additions & 206 deletions ...learn/metalearning/files/precision_samples_binary.classification_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

396 changes: 198 additions & 198 deletions ...earn/metalearning/files/precision_samples_binary.classification_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

412 changes: 206 additions & 206 deletions ...n/metalearning/files/precision_samples_multiclass.classification_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

396 changes: 198 additions & 198 deletions .../metalearning/files/precision_samples_multiclass.classification_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

412 changes: 206 additions & 206 deletions ...earn/metalearning/files/precision_weighted_binary.classification_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

396 changes: 198 additions & 198 deletions ...arn/metalearning/files/precision_weighted_binary.classification_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

412 changes: 206 additions & 206 deletions .../metalearning/files/precision_weighted_multiclass.classification_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

396 changes: 198 additions & 198 deletions ...metalearning/files/precision_weighted_multiclass.classification_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

196 changes: 98 additions & 98 deletions autosklearn/metalearning/files/r2_regression_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

196 changes: 98 additions & 98 deletions autosklearn/metalearning/files/r2_regression_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

412 changes: 206 additions & 206 deletions autosklearn/metalearning/files/recall_binary.classification_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

396 changes: 198 additions & 198 deletions autosklearn/metalearning/files/recall_binary.classification_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

412 changes: 206 additions & 206 deletions autosklearn/metalearning/files/recall_macro_binary.classification_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

396 changes: 198 additions & 198 deletions autosklearn/metalearning/files/recall_macro_binary.classification_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

412 changes: 206 additions & 206 deletions ...klearn/metalearning/files/recall_macro_multiclass.classification_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

396 changes: 198 additions & 198 deletions ...learn/metalearning/files/recall_macro_multiclass.classification_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

412 changes: 206 additions & 206 deletions autosklearn/metalearning/files/recall_micro_binary.classification_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

396 changes: 198 additions & 198 deletions autosklearn/metalearning/files/recall_micro_binary.classification_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

412 changes: 206 additions & 206 deletions ...klearn/metalearning/files/recall_micro_multiclass.classification_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

396 changes: 198 additions & 198 deletions ...learn/metalearning/files/recall_micro_multiclass.classification_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

412 changes: 206 additions & 206 deletions autosklearn/metalearning/files/recall_multiclass.classification_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

396 changes: 198 additions & 198 deletions autosklearn/metalearning/files/recall_multiclass.classification_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

412 changes: 206 additions & 206 deletions autosklearn/metalearning/files/recall_samples_binary.classification_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

396 changes: 198 additions & 198 deletions ...sklearn/metalearning/files/recall_samples_binary.classification_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

412 changes: 206 additions & 206 deletions ...earn/metalearning/files/recall_samples_multiclass.classification_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

396 changes: 198 additions & 198 deletions ...arn/metalearning/files/recall_samples_multiclass.classification_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

412 changes: 206 additions & 206 deletions ...sklearn/metalearning/files/recall_weighted_binary.classification_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

396 changes: 198 additions & 198 deletions ...klearn/metalearning/files/recall_weighted_binary.classification_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

412 changes: 206 additions & 206 deletions ...arn/metalearning/files/recall_weighted_multiclass.classification_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

396 changes: 198 additions & 198 deletions ...rn/metalearning/files/recall_weighted_multiclass.classification_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

412 changes: 206 additions & 206 deletions autosklearn/metalearning/files/roc_auc_binary.classification_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

396 changes: 198 additions & 198 deletions autosklearn/metalearning/files/roc_auc_binary.classification_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

412 changes: 206 additions & 206 deletions autosklearn/metalearning/files/roc_auc_multiclass.classification_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

396 changes: 198 additions & 198 deletions autosklearn/metalearning/files/roc_auc_multiclass.classification_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

196 changes: 98 additions & 98 deletions autosklearn/metalearning/files/root_mean_squared_error_regression_dense/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

196 changes: 98 additions & 98 deletions autosklearn/metalearning/files/root_mean_squared_error_regression_sparse/configurations.csv
100644 → 100755

Large diffs are not rendered by default.

12 changes: 10 additions & 2 deletions autosklearn/metalearning/metafeatures/metafeatures.py
@@ -1082,11 +1082,19 @@ def calculate_all_metafeatures(X, y, categorical, dataset_name, logger,
# TODO make sure this is done as efficient as possible (no copy for
# sparse matrices because of wrong sparse format)
sparse = scipy.sparse.issparse(X)

feat_type = {key: 'categorical' if value else 'numerical'
for key, value in categorical.items()}

# TODO make this more cohesive to the overall structure (quick bug fix)
if isinstance(X, pd.DataFrame):
for key in X.select_dtypes(include="string").columns:
feat_type[key] = "string"

DPP = FeatTypeSplit(
# The difference between feat_type and categorical, is that
# categorical has True/False instead of categorical/numerical
feat_type={key: 'categorical' if value else 'numerical'
for key, value in categorical.items()},
feat_type=feat_type,
force_sparse_output=True)
X_transformed = DPP.fit_transform(X)
categorical_transformed = {i: False for i in range(X_transformed.shape[1])}
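
For illustration (not part of the diff): the dtype check added above in a self-contained form; the DataFrame here is invented.

```python
# Mirrors the hunk above: start from the categorical mask, then flag pandas
# "string" columns as 'string' features.
import pandas as pd

X = pd.DataFrame({
    "n": [1.0, 2.0, 3.0],
    "txt": pd.Series(["foo bar", "baz", "foo"], dtype="string"),
})
categorical = {"n": False, "txt": False}

feat_type = {key: 'categorical' if value else 'numerical'
             for key, value in categorical.items()}
if isinstance(X, pd.DataFrame):
    for key in X.select_dtypes(include="string").columns:
        feat_type[key] = "string"

print(feat_type)  # {'n': 'numerical', 'txt': 'string'}
```
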
76 changes: 76 additions & 0 deletions autosklearn/pipeline/components/data_preprocessing/feature_reduction/truncated_svd.py (new file)
@@ -0,0 +1,76 @@
from typing import Dict, Optional, Tuple, Union

from ConfigSpace.configuration_space import ConfigurationSpace
import ConfigSpace.hyperparameters as CSH

import numpy as np

from autosklearn.pipeline.base import DATASET_PROPERTIES_TYPE, PIPELINE_DATA_DTYPE
from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm
from autosklearn.pipeline.constants import DENSE, SPARSE, UNSIGNED_DATA, INPUT

from sklearn.decomposition import TruncatedSVD


class FeatureReduction(AutoSklearnPreprocessingAlgorithm):
"""
Reduces the features created by a bag of words encoding
"""

def __init__(
self,
n_components: Optional[int] = None,
random_state: Optional[Union[int, np.random.RandomState]] = None
) -> None:
self.n_components = n_components
self.random_state = random_state

def fit(self, X: PIPELINE_DATA_DTYPE, y: Optional[PIPELINE_DATA_DTYPE] = None
) -> 'FeatureReduction':
if X.shape[1] > self.n_components:
self.preprocessor = TruncatedSVD(n_components=self.n_components,
random_state=self.random_state)
elif X.shape[1] <= self.n_components and X.shape[1] != 1:
self.preprocessor = TruncatedSVD(n_components=X.shape[1] - 1,
random_state=self.random_state)
else:
raise ValueError("The text embedding consists only of a single dimension.\n"
"Are you sure that your text data is necessary?")
self.preprocessor.fit(X)
return self

def transform(self, X: PIPELINE_DATA_DTYPE) -> PIPELINE_DATA_DTYPE:
if self.preprocessor is None:
raise NotImplementedError()
return self.preprocessor.transform(X)

@staticmethod
def get_properties(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None
) -> Dict[str, Optional[Union[str, int, bool, Tuple]]]:
return {'shortname': 'TextFeatureReduction',
'name': 'TextFeatureReduction',
'handles_missing_values': True,
'handles_nominal_values': True,
'handles_numerical_features': True,
'prefers_data_scaled': False,
'prefers_data_normalized': False,
'handles_regression': True,
'handles_classification': True,
'handles_multiclass': True,
'handles_multilabel': True,
'handles_multioutput': True,
'is_deterministic': True,
'handles_sparse': True,
'handles_dense': True,
'input': (DENSE, SPARSE, UNSIGNED_DATA),
'output': (INPUT,),
'preferred_dtype': None}

@staticmethod
def get_hyperparameter_search_space(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None
) -> ConfigurationSpace:
cs = ConfigurationSpace()
cs.add_hyperparameter(
CSH.UniformIntegerHyperparameter("n_components", lower=1, upper=10000,
default_value=100, log=True))
return cs
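
For illustration (not part of the diff): a rough stand-alone usage sketch for `FeatureReduction`, assuming the module path used elsewhere in this PR; the sparse bag-of-words matrix is synthetic.

```python
# Fit TruncatedSVD on a wide, sparse bag-of-words matrix and project it
# down to n_components dimensions.
import numpy as np
from scipy import sparse
from autosklearn.pipeline.components.data_preprocessing.feature_reduction.truncated_svd import \
    FeatureReduction

rng = np.random.RandomState(0)
X_bow = sparse.random(20, 50, density=0.1, format="csr", random_state=rng)

reducer = FeatureReduction(n_components=10, random_state=0)
X_reduced = reducer.fit(X_bow).transform(X_bow)
print(X_reduced.shape)  # (20, 10)
```
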
57 changes: 34 additions & 23 deletions autosklearn/pipeline/components/data_preprocessing/feature_type.py
@@ -19,6 +19,8 @@
import CategoricalPreprocessingPipeline
from autosklearn.pipeline.components.data_preprocessing.feature_type_numerical \
import NumericalPreprocessingPipeline
from autosklearn.pipeline.components.data_preprocessing.feature_type_text \
import TextPreprocessingPipeline
from autosklearn.pipeline.components.base import AutoSklearnComponent, AutoSklearnChoice, \
AutoSklearnPreprocessingAlgorithm
from autosklearn.pipeline.constants import DENSE, SPARSE, UNSIGNED_DATA, INPUT
@@ -29,8 +31,8 @@


class FeatTypeSplit(AutoSklearnPreprocessingAlgorithm):
""" This component is used to apply distinct transformations to categorical and
numerical features of a dataset. It is built on top of sklearn's ColumnTransformer.
""" This component is used to apply distinct transformations to categorical,
numerical and text features of a dataset. It is built on top of sklearn's ColumnTransformer.
"""

def __init__(
@@ -82,9 +84,23 @@ def __init__(
config=None, steps=pipeline, dataset_properties=dataset_properties,
include=include, exclude=exclude, random_state=random_state,
init_params=init_params)

# The pipeline that will be applied to the text features (i.e. columns)
# of the dataset
# Configuration of the data-preprocessor is different from the configuration of
# the numerical or categorical pipeline. Hence, force to None
# It is actually the call to set_hyperparameter who properly sets this argument
# TODO: Extract the child configuration space from the FeatTypeSplit to the
# pipeline if needed
self.txt_ppl = TextPreprocessingPipeline(
config=None, steps=pipeline, dataset_properties=dataset_properties,
include=include, exclude=exclude, random_state=random_state,
init_params=init_params)

self._transformers: List[Tuple[str, AutoSklearnComponent]] = [
("categorical_transformer", self.categ_ppl),
("numerical_transformer", self.numer_ppl),
("text_transformer", self.txt_ppl),
]
if self.config:
self.set_hyperparameters(self.config, init_params=init_params)
@@ -96,6 +112,7 @@ def fit(self, X: SUPPORTED_FEAT_TYPES, y: Optional[SUPPORTED_TARGET_TYPES] = Non
n_feats = X.shape[1]
categorical_features = []
numerical_features = []
text_features = []
if self.feat_type is not None:
# Make sure that we are not missing any column!
expected = set(self.feat_type.keys())
@@ -104,42 +121,36 @@ def fit(self, X: SUPPORTED_FEAT_TYPES, y: Optional[SUPPORTED_TARGET_TYPES] = Non
else:
columns = set(range(n_feats))
if expected != columns:
raise ValueError("Train data has columns={} yet the feat_types are feat={}".format(
expected,
columns
))
raise ValueError(f"Train data has columns={expected} yet the"
f" feat_types are feat={columns}")
categorical_features = [key for key, value in self.feat_type.items()
if value.lower() == 'categorical']
numerical_features = [key for key, value in self.feat_type.items()
if value.lower() == 'numerical']
text_features = [key for key, value in self.feat_type.items()
if value.lower() == "string"]

# If no categorical features, assume we have a numerical only pipeline
if len(categorical_features) == 0:
sklearn_transf_spec: List[Tuple[str, BaseEstimator, List[Union[str, bool, int]]]] = [
("numerical_transformer", self.numer_ppl, [True] * n_feats)
]
# If all features are categorical, then just the categorical transformer is used
elif len(numerical_features) == 0:
sklearn_transf_spec = [
("categorical_transformer", self.categ_ppl, [True] * n_feats)
(name, transformer, feature_columns)
for name, transformer, feature_columns
in [
("text_transformer", self.txt_ppl, text_features),
("categorical_transformer", self.categ_ppl, categorical_features),
("numerical_transformer", self.numer_ppl, numerical_features)
]
if len(feature_columns) > 0
]
# For the other cases, both transformers are used
else:
sklearn_transf_spec = [
("categorical_transformer", self.categ_ppl, categorical_features),
("numerical_transformer", self.numer_ppl, numerical_features)
]
# self.feature_type == None assumes numerical case
sklearn_transf_spec = [("numerical_transformer", self.numer_ppl, [True]*n_feats)]

# And one last check in case feat type is None
# And to make sure the final specification has all the columns
# considered in the column transformer
total_columns = sum([len(features) for name, ppl, features in sklearn_transf_spec])
if total_columns != n_feats:
raise ValueError("Missing columns in the specification of the data validator"
" for train data={} and spec={}".format(
np.shape(X),
sklearn_transf_spec,
))
f" for train data={np.shape(X)} and spec={sklearn_transf_spec}")

self.sparse_ = sparse.issparse(X) or self.force_sparse_output
self.column_transformer = sklearn.compose.ColumnTransformer(
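
For illustration (not part of the diff): the new column-transformer wiring in a stripped-down form, with plain strings standing in for the sub-pipelines and an invented `feat_type`. Only groups that actually received columns end up in the specification, which replaces the old if/elif branching over categorical/numerical only.

```python
feat_type = {"age": "numerical", "colour": "categorical", "review": "string"}

txt_ppl = "text-pipeline"           # placeholder for TextPreprocessingPipeline
categ_ppl = "categorical-pipeline"  # placeholder for CategoricalPreprocessingPipeline
numer_ppl = "numerical-pipeline"    # placeholder for NumericalPreprocessingPipeline

categorical_features = [k for k, v in feat_type.items() if v.lower() == "categorical"]
numerical_features = [k for k, v in feat_type.items() if v.lower() == "numerical"]
text_features = [k for k, v in feat_type.items() if v.lower() == "string"]

# Keep only the transformers that have at least one column assigned.
sklearn_transf_spec = [
    (name, transformer, feature_columns)
    for name, transformer, feature_columns in [
        ("text_transformer", txt_ppl, text_features),
        ("categorical_transformer", categ_ppl, categorical_features),
        ("numerical_transformer", numer_ppl, numerical_features),
    ]
    if len(feature_columns) > 0
]
print(sklearn_transf_spec)
# [('text_transformer', 'text-pipeline', ['review']),
#  ('categorical_transformer', 'categorical-pipeline', ['colour']),
#  ('numerical_transformer', 'numerical-pipeline', ['age'])]
```
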
119 changes: 119 additions & 0 deletions autosklearn/pipeline/components/data_preprocessing/feature_type_text.py (new file)
@@ -0,0 +1,119 @@
from typing import Any, List, Dict, Optional, Tuple, Union

from ConfigSpace.configuration_space import Configuration, ConfigurationSpace

import numpy as np

from sklearn.base import BaseEstimator

from autosklearn.pipeline.components.data_preprocessing.text_encoding \
import BagOfWordChoice
from autosklearn.pipeline.components.data_preprocessing.feature_reduction.truncated_svd import \
FeatureReduction
from autosklearn.pipeline.base import (
BasePipeline,
DATASET_PROPERTIES_TYPE,
)
from autosklearn.pipeline.constants import DENSE, SPARSE, UNSIGNED_DATA, INPUT


class TextPreprocessingPipeline(BasePipeline):
"""This class implements a pipeline for data preprocessing of text features.
It assumes that the data to be transformed is made only of text features.
The steps of this pipeline are:
1 - Vectorize: fits a *Vectorizer object and applies it
2 - text feature reduction: TruncatedSVD

Parameters
----------
config : ConfigSpace.configuration_space.Configuration
The configuration to evaluate.

random_state : Optional[int | RandomState]
If int, random_state is the seed used by the random number generator;
If RandomState instance, random_state is the random number generator;
If None, the random number generator is the RandomState instance
used by `np.random`."""

def __init__(
self,
config: Optional[Configuration] = None,
steps: Optional[List[Tuple[str, BaseEstimator]]] = None,
dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None,
include: Optional[Dict[str, str]] = None,
exclude: Optional[Dict[str, str]] = None,
random_state: Optional[Union[int, np.random.RandomState]] = None,
init_params: Optional[Dict[str, Any]] = None
) -> None:
self._output_dtype = np.int32
super().__init__(
config, steps, dataset_properties, include, exclude,
random_state, init_params
)

@staticmethod
def get_properties(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None
) -> Dict[str, Optional[Union[str, int, bool, Tuple]]]:
return {'shortname': 'txt_datapreproc',
'name': 'text data preprocessing',
'handles_missing_values': True,
'handles_nominal_values': False,
'handles_numerical_features': False,
'prefers_data_scaled': False,
'prefers_data_normalized': False,
'handles_regression': True,
'handles_classification': True,
'handles_multiclass': True,
'handles_multilabel': True,
'is_deterministic': True,
'handles_sparse': True,
'handles_dense': True,
'input': (DENSE, SPARSE, UNSIGNED_DATA),
'output': (INPUT,),
'preferred_dtype': None}

def _get_hyperparameter_search_space(
self,
include: Optional[Dict[str, str]] = None,
exclude: Optional[Dict[str, str]] = None,
dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None,
) -> ConfigurationSpace:
"""Create the hyperparameter configuration space.

Parameters
----------
# TODO add parameter description

Returns
-------
cs : ConfigSpace.configuration_space.Configuration
The configuration space describing the SimpleRegressionClassifier.
"""
cs = ConfigurationSpace()
if dataset_properties is None or not isinstance(dataset_properties, dict):
dataset_properties = dict()

cs = self._get_base_search_space(
cs=cs, dataset_properties=dataset_properties,
exclude=exclude, include=include, pipeline=self.steps)

return cs

def _get_pipeline_steps(self,
dataset_properties: Optional[Dict[str, str]] = None,
) -> List[Tuple[str, BaseEstimator]]:
steps = []

default_dataset_properties = {}
if dataset_properties is not None and isinstance(dataset_properties, dict):
default_dataset_properties.update(dataset_properties)

steps.extend([
("text_encoding", BagOfWordChoice(default_dataset_properties,
random_state=self.random_state)),
("feature_reduction", FeatureReduction(random_state=self.random_state))
])
return steps

def _get_estimator_hyperparameter_name(self) -> str:
return "text data preprocessing"