Text Processing #1300

Merged: 40 commits, Feb 3, 2022

Commits (40)
4450d86
commit meta learning data bases
Louquinze Nov 9, 2021
e821eaf
commit changed files
Louquinze Nov 9, 2021
ae4f59f
commit new files
Louquinze Nov 9, 2021
d0a10ab
fixed experimental settings
Louquinze Nov 12, 2021
65271a9
Merge branch 'automl:development' into development
Louquinze Nov 12, 2021
55e87e2
implemented last comments on old PR
Louquinze Nov 17, 2021
590387d
adapted metalearning to last commit
Louquinze Nov 17, 2021
2809c46
add a text preprocessing example
Louquinze Nov 17, 2021
ffe8ccf
integrated feedback
Louquinze Nov 17, 2021
8094eb5
new changes on *.csv files
Louquinze Nov 17, 2021
1a27144
reset changes
Louquinze Nov 17, 2021
63e6fdb
Merge remote-tracking branch 'origin/development' into development
Louquinze Nov 17, 2021
1a2f66d
add changes for merging
Louquinze Nov 17, 2021
107e854
add changes for merging
Louquinze Nov 17, 2021
88aa101
add changes for merging
Louquinze Nov 17, 2021
11f092f
try to merge
Louquinze Nov 17, 2021
d5a03d6
fixed string representation for metalearning (some sort of hot fix, m…
Louquinze Dec 7, 2021
220807e
fixed string representation for metalearning (some sort of hot fix, m…
Louquinze Dec 7, 2021
38ffd06
fixed string representation for metalearning (some sort of hot fix, m…
Louquinze Dec 7, 2021
fa7c8e7
init
Louquinze Jan 13, 2022
2e1947a
init
Louquinze Jan 13, 2022
b56f05f
commit changes for text preprocessing
Louquinze Jan 13, 2022
0d95435
text preprocessing commit
Louquinze Jan 13, 2022
3a00674
fix metalearning
Louquinze Jan 13, 2022
cabdb66
fix metalearning
Louquinze Jan 13, 2022
fdd7007
resolve conflicts
Louquinze Jan 14, 2022
8fe74a4
Merge branch 'automl-development' into development
Louquinze Jan 14, 2022
20caf09
adapted test to new text feature
Louquinze Jan 14, 2022
42a7bdb
fix style guide issues
Louquinze Jan 14, 2022
b7bc8fb
integrate PR comments
Louquinze Jan 19, 2022
e85eb2e
integrate PR comments
Louquinze Jan 19, 2022
cafb1d4
implemented the comments to the last PR
Louquinze Jan 23, 2022
b9da42d
fitted operation is not in place, therefore we have to assign the fitt…
Louquinze Jan 23, 2022
d2d5a24
add first text processing tests
Louquinze Jan 23, 2022
ac40ff9
add first text processing tests
Louquinze Jan 24, 2022
38be7c3
including comments from 01.25.
Louquinze Jan 25, 2022
5f6d6a7
including comments from 01.28.
Louquinze Jan 28, 2022
94b9c27
including comments from 01.28.
Louquinze Jan 28, 2022
bc6e883
including comments from 01.28.
Louquinze Jan 28, 2022
ce1c0d1
including comments from 01.31.
Louquinze Jan 31, 2022
14 changes: 9 additions & 5 deletions autosklearn/data/feature_validator.py
@@ -129,9 +129,9 @@ def fit(
))

for ft in self.feat_type.values():
if ft.lower() not in ['categorical', 'numerical']:
raise ValueError('Only `Categorical` and `Numerical` are '
'valid feature types, you passed `%s`' % ft)
if ft.lower() not in ['categorical', 'numerical', 'string']:
raise ValueError('Only `Categorical`, `Numerical` and `String` are '
'valid feature types')

if X_test is not None:
self._check_data(X_test)
@@ -264,7 +264,7 @@ def get_feat_type_from_columns(
) -> typing.Dict[typing.Union[str, int], str]:
"""
Returns a dictionary that maps pandas dataframe columns to a feature type.
This feature type can be categorical or numerical
This feature type can be categorical, numerical or string

Parameters
----------
@@ -288,6 +288,9 @@ def get_feat_type_from_columns(
elif X[column].dtype.name in ['category', 'bool']:

feat_type[column] = 'categorical'
# new string option
elif X[column].dtype.name == "string":
feat_type[column] = 'string'
# Move away from np.issubdtype as it causes
# TypeError: data type not understood in certain pandas types
elif not is_numeric_dtype(X[column]):
@@ -361,9 +364,10 @@ def list_to_dataframe(
if len(self.dtypes) == 0:
# Categorical data is inferred as string. Convert to categorical.
# Warn the user about dtypes or request them to use a dataframe
# Todo check if this loop is still needed
for col in X_train.columns:
if X_train[col].dtype.name == 'string':
X_train[col] = X_train[col].astype('category')
X_train[col] = X_train[col].astype('string')

self.dtypes = {col: X_train[col].dtype.name.lower() for col in X_train.columns}
else:
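For context, a minimal standalone sketch (not part of the PR; the column names are made up) of how pandas dtype names map onto the three feature types the validator now accepts:

import pandas as pd

X = pd.DataFrame({
    "age": [23, 41, 35],                                         # int64 -> 'numerical'
    "color": pd.Series(["r", "g", "b"], dtype="category"),       # category -> 'categorical'
    "review": pd.Series(["good", "bad", "ok"], dtype="string"),  # string -> 'string' (new)
})

# Mirrors the dtype dispatch in get_feat_type_from_columns
feat_type = {}
for column in X.columns:
    if X[column].dtype.name in ["category", "bool"]:
        feat_type[column] = "categorical"
    elif X[column].dtype.name == "string":
        feat_type[column] = "string"
    else:
        feat_type[column] = "numerical"

print(feat_type)  # {'age': 'numerical', 'color': 'categorical', 'review': 'string'}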

Large diffs in 88 changed files are not rendered by default.

196 changes: 98 additions & 98 deletions autosklearn/metalearning/files/r2_regression_dense/configurations.csv

Large diffs are not rendered by default.

196 changes: 98 additions & 98 deletions autosklearn/metalearning/files/r2_regression_sparse/configurations.csv

Large diffs are not rendered by default.

Large diffs in 26 further changed files are not rendered by default.

2 changes: 2 additions & 0 deletions autosklearn/metalearning/metalearning/meta_base.py
@@ -46,6 +46,8 @@ def __init__(self, configuration_space, aslib_directory, logger):
for algorithm_id in self.configurations:
configuration = self.configurations[algorithm_id]
try:
# ToDo check if all stored configurations are actually readable
# (999, instead of 0.999)
configurations[str(algorithm_id)] = \
(Configuration(configuration_space, values=configuration))
except (ValueError, KeyError) as e:
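To illustrate the failure mode this ToDo refers to (a stored value like 999 instead of 0.999), a small self-contained sketch using ConfigSpace; the hyperparameter name is made up:

from ConfigSpace.configuration_space import Configuration, ConfigurationSpace
from ConfigSpace.hyperparameters import UniformFloatHyperparameter

cs = ConfigurationSpace()
cs.add_hyperparameter(UniformFloatHyperparameter("colsample_bytree", 0.0, 1.0))

stored = {"colsample_bytree": 999.0}  # corrupted: should have been 0.999
try:
    Configuration(cs, values=stored)  # raises: value outside [0.0, 1.0]
except (ValueError, KeyError) as e:
    print(f"Skipping unreadable stored configuration: {e}")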
57 changes: 34 additions & 23 deletions autosklearn/pipeline/components/data_preprocessing/feature_type.py
@@ -19,6 +19,8 @@
import CategoricalPreprocessingPipeline
from autosklearn.pipeline.components.data_preprocessing.feature_type_numerical \
import NumericalPreprocessingPipeline
from autosklearn.pipeline.components.data_preprocessing.feature_type_text \
import TextPreprocessingPipeline
from autosklearn.pipeline.components.base import AutoSklearnComponent, AutoSklearnChoice, \
AutoSklearnPreprocessingAlgorithm
from autosklearn.pipeline.constants import DENSE, SPARSE, UNSIGNED_DATA, INPUT
@@ -29,8 +31,8 @@


class FeatTypeSplit(AutoSklearnPreprocessingAlgorithm):
""" This component is used to apply distinct transformations to categorical and
numerical features of a dataset. It is built on top of sklearn's ColumnTransformer.
""" This component is used to apply distinct transformations to categorical,
numerical and text features of a dataset. It is built on top of sklearn's ColumnTransformer.
"""

def __init__(
@@ -82,9 +84,23 @@ def __init__(
config=None, steps=pipeline, dataset_properties=dataset_properties,
include=include, exclude=exclude, random_state=random_state,
init_params=init_params)

# The pipeline that will be applied to the text features (i.e. columns)
# of the dataset
# Configuration of the data-preprocessor is different from the configuration of
# the numerical or categorical pipeline. Hence, force to None
# It is actually the call to set_hyperparameters that properly sets this argument
# TODO: Extract the child configuration space from the FeatTypeSplit to the
# pipeline if needed
self.txt_ppl = TextPreprocessingPipeline(
config=None, steps=pipeline, dataset_properties=dataset_properties,
include=include, exclude=exclude, random_state=random_state,
init_params=init_params)

self._transformers: List[Tuple[str, AutoSklearnComponent]] = [
("categorical_transformer", self.categ_ppl),
("numerical_transformer", self.numer_ppl),
("text_transformer", self.txt_ppl),
]
if self.config:
self.set_hyperparameters(self.config, init_params=init_params)
@@ -96,6 +112,7 @@ def fit(self, X: SUPPORTED_FEAT_TYPES, y: Optional[SUPPORTED_TARGET_TYPES] = Non
n_feats = X.shape[1]
categorical_features = []
numerical_features = []
text_features = []
if self.feat_type is not None:
# Make sure that we are not missing any column!
expected = set(self.feat_type.keys())
@@ -104,42 +121,36 @@ def fit(self, X: SUPPORTED_FEAT_TYPES, y: Optional[SUPPORTED_TARGET_TYPES] = Non
else:
columns = set(range(n_feats))
if expected != columns:
raise ValueError("Train data has columns={} yet the feat_types are feat={}".format(
expected,
columns
))
raise ValueError(f"Train data has columns={expected} yet the"
f" feat_types are feat={columns}")
categorical_features = [key for key, value in self.feat_type.items()
if value.lower() == 'categorical']
numerical_features = [key for key, value in self.feat_type.items()
if value.lower() == 'numerical']
text_features = [key for key, value in self.feat_type.items()
if value.lower() == "string"]

# If no categorical features, assume we have a numerical only pipeline
if len(categorical_features) == 0:
sklearn_transf_spec: List[Tuple[str, BaseEstimator, List[Union[str, bool, int]]]] = [
("numerical_transformer", self.numer_ppl, [True] * n_feats)
]
# If all features are categorical, then just the categorical transformer is used
elif len(numerical_features) == 0:
sklearn_transf_spec = [
("categorical_transformer", self.categ_ppl, [True] * n_feats)
(name, transformer, feature_columns)
for name, transformer, feature_columns
in [
("text_transformer", self.txt_ppl, text_features),
("categorical_transformer", self.categ_ppl, categorical_features),
("numerical_transformer", self.numer_ppl, numerical_features)
]
if len(feature_columns) > 0
]
# For the other cases, both transformers are used
else:
sklearn_transf_spec = [
("categorical_transformer", self.categ_ppl, categorical_features),
("numerical_transformer", self.numer_ppl, numerical_features)
]
# self.feature_type == None assumes numerical case
sklearn_transf_spec = [("numerical_transformer", self.numer_ppl, [True]*n_feats)]

# And one last check in case feat type is None
# And to make sure the final specification has all the columns
# considered in the column transformer
total_columns = sum([len(features) for name, ppl, features in sklearn_transf_spec])
if total_columns != n_feats:
raise ValueError("Missing columns in the specification of the data validator"
" for train data={} and spec={}".format(
np.shape(X),
sklearn_transf_spec,
))
f" for train data={np.shape(X)} and spec={sklearn_transf_spec}")

self.sparse_ = sparse.issparse(X) or self.force_sparse_output
self.column_transformer = sklearn.compose.ColumnTransformer(
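As a standalone illustration of the column split that FeatTypeSplit builds on top of sklearn's ColumnTransformer (a plain scikit-learn sketch, not the PR's code; the column names are made up):

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

X = pd.DataFrame({
    "age": [23, 41, 35],
    "color": ["r", "g", "b"],
    "review": ["good product", "bad service", "ok overall"],
})

# One transformer per feature type; empty groups are simply left out,
# matching the filtering list comprehension above.
preprocessor = ColumnTransformer([
    ("numerical_transformer", StandardScaler(), ["age"]),
    ("categorical_transformer", OneHotEncoder(handle_unknown="ignore"), ["color"]),
    # a single column name (no list): text vectorizers expect 1-D input
    ("text_transformer", TfidfVectorizer(), "review"),
])

Xt = preprocessor.fit_transform(X)
print(Xt.shape)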
@@ -0,0 +1,118 @@
from typing import Any, List, Dict, Optional, Tuple, Union

from ConfigSpace.configuration_space import Configuration, ConfigurationSpace

import numpy as np

from sklearn.base import BaseEstimator

from autosklearn.pipeline.components.data_preprocessing.text_encoding \
import BagOfWordChoice
from autosklearn.pipeline.components.data_preprocessing.text_feature_reduction.truncated_svd import \
TextFeatureReduction
from autosklearn.pipeline.base import (
BasePipeline,
DATASET_PROPERTIES_TYPE,
)
from autosklearn.pipeline.constants import DENSE, SPARSE, UNSIGNED_DATA, INPUT


class TextPreprocessingPipeline(BasePipeline):
"""This class implements a pipeline for data preprocessing of text features.
It assumes that the data to be transformed is made only of text features.
The steps of this pipeline are:
1 - Vectorize: fits a *Vectorizer object and applies it
2 - Text feature reduction: TruncatedSVD

Parameters
----------
config : ConfigSpace.configuration_space.Configuration
The configuration to evaluate.

random_state : Optional[int | RandomState]
If int, random_state is the seed used by the random number generator;
If RandomState instance, random_state is the random number generator;
If None, the random number generator is the RandomState instance
used by `np.random`."""

def __init__(
self,
config: Optional[Configuration] = None,
steps: Optional[List[Tuple[str, BaseEstimator]]] = None,
dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None,
include: Optional[Dict[str, str]] = None,
exclude: Optional[Dict[str, str]] = None,
random_state: Optional[Union[int, np.random.RandomState]] = None,
init_params: Optional[Dict[str, Any]] = None
) -> None:
self._output_dtype = np.int32
super().__init__(
config, steps, dataset_properties, include, exclude,
random_state, init_params
)

@staticmethod
def get_properties(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None
) -> Dict[str, Optional[Union[str, int, bool, Tuple]]]:
return {'shortname': 'txt_datapreproc',
'name': 'text data preprocessing',
'handles_missing_values': True,
'handles_nominal_values': False,
'handles_numerical_features': False,
'prefers_data_scaled': False,
'prefers_data_normalized': False,
'handles_regression': True,
'handles_classification': True,
'handles_multiclass': True,
'handles_multilabel': True,
'is_deterministic': True,
'handles_sparse': True,
'handles_dense': True,
'input': (DENSE, SPARSE, UNSIGNED_DATA),
'output': (INPUT,),
'preferred_dtype': None}

def _get_hyperparameter_search_space(
self,
include: Optional[Dict[str, str]] = None,
exclude: Optional[Dict[str, str]] = None,
dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None,
) -> ConfigurationSpace:
"""Create the hyperparameter configuration space.

Parameters
----------

Returns
-------
cs : ConfigSpace.configuration_space.Configuration
The configuration space describing the TextPreprocessingPipeline.
"""
cs = ConfigurationSpace()
if dataset_properties is None or not isinstance(dataset_properties, dict):
dataset_properties = dict()

cs = self._get_base_search_space(
cs=cs, dataset_properties=dataset_properties,
exclude=exclude, include=include, pipeline=self.steps)

return cs

def _get_pipeline_steps(self,
dataset_properties: Optional[Dict[str, str]] = None,
) -> List[Tuple[str, BaseEstimator]]:
steps = []

default_dataset_properties = {}
if dataset_properties is not None and isinstance(dataset_properties, dict):
default_dataset_properties.update(dataset_properties)

# ToDo: implement the feature reduction
steps.extend([
("text_encoding", BagOfWordChoice(default_dataset_properties)),
("text_feature_reduction", TextFeatureReduction())
])
return steps

def _get_estimator_hyperparameter_name(self) -> str:
return "text data preprocessing"
@@ -0,0 +1,117 @@
from collections import OrderedDict
import os

from typing import Any, Dict, Optional

from ConfigSpace import Configuration
from ConfigSpace.configuration_space import ConfigurationSpace
from ConfigSpace.hyperparameters import CategoricalHyperparameter

from sklearn.base import BaseEstimator

from ...base import AutoSklearnPreprocessingAlgorithm, find_components, \
ThirdPartyComponents, AutoSklearnChoice

from autosklearn.pipeline.base import DATASET_PROPERTIES_TYPE, PIPELINE_DATA_DTYPE

bow_directory = os.path.split(__file__)[0]
_bows = find_components(__package__,
bow_directory,
AutoSklearnPreprocessingAlgorithm)
_addons = ThirdPartyComponents(AutoSklearnPreprocessingAlgorithm)


def add_bow(bow: 'BagOfWordChoice') -> None:
_addons.add_component(bow)


class BagOfWordChoice(AutoSklearnChoice):

@classmethod
def get_components(cls: BaseEstimator) -> Dict[str, BaseEstimator]:
components: Dict[str, BaseEstimator] = OrderedDict()
components.update(_bows)
components.update(_addons.components)
return components

def get_hyperparameter_search_space(
self,
dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None,
default: Optional[str] = None,
include: Optional[Dict[str, str]] = None,
exclude: Optional[Dict[str, str]] = None,
) -> ConfigurationSpace:
cs = ConfigurationSpace()

if dataset_properties is None:
dataset_properties = {}

# Compile a list of legal preprocessors for this problem
available_preprocessors = self.get_available_components(
dataset_properties=dataset_properties,
include=include, exclude=exclude)

if len(available_preprocessors) == 0:
raise ValueError(
"No bag of word encoders found, please add any bag of word encoder"
"component.")

if default is None:
# ToDo: add the different versions,
# e.g. a 'relative' version
defaults = ['bag_of_words_encoding']
for default_ in defaults:
if default_ in available_preprocessors:
default = default_
break

# ToDo: how to add hyperparameters to available_preprocessors
preprocessor = CategoricalHyperparameter(
'__choice__', list(available_preprocessors.keys()), default_value=default)
# ToDo: add hierarchical hyperparameters for the different choices
cs.add_hyperparameter(preprocessor)
for name in available_preprocessors:
preprocessor_configuration_space = available_preprocessors[name]. \
get_hyperparameter_search_space(dataset_properties)
parent_hyperparameter = {'parent': preprocessor, 'value': name}
cs.add_configuration_space(name, preprocessor_configuration_space,
parent_hyperparameter=parent_hyperparameter)

self.configuration_space = cs
self.dataset_properties = dataset_properties
return cs

def set_hyperparameters(self, configuration: Configuration,
init_params: Optional[Dict[str, Any]] = None
) -> 'BagOfWordChoice':
new_params = {}

params = configuration.get_dictionary()
choice = params['__choice__']
del params['__choice__']

for param, value in params.items():
param = param.replace(choice, '').replace(':', '')
new_params[param] = value

if init_params is not None:
for param, value in init_params.items():
# These next two lines differ from the base class: they allow removing
# the categorical feature indicator array so that it is not passed to
# the no-encoding component
if choice not in param:
continue
param = param.replace(choice, '').replace(':', '')
new_params[param] = value

new_params['random_state'] = self.random_state

self.new_params = new_params
self.choice = self.get_components()[choice](**new_params)

return self

def transform(self, X: PIPELINE_DATA_DTYPE) -> PIPELINE_DATA_DTYPE:
return self.choice.transform(X)
Loading