-
Notifications
You must be signed in to change notification settings - Fork 1.3k
Commit
* commit meta learning data bases * commit changed files * commit new files * fixed experimental settings * implemented last comments on old PR * adapted metalearning to last commit * add a text preprocessing example * integrated feedback * new changes on *.csv files * reset changes * add changes for merging * add changes for merging * add changes for merging * try to merge * fixed string representation for metalearning (some sort of hot fix, maybe this needs to be fixed in a bigger scale) * fixed string representation for metalearning (some sort of hot fix, maybe this needs to be fixed in a bigger scale) * fixed string representation for metalearning (some sort of hot fix, maybe this needs to be fixed in a bigger scale) * init * init * commit changes for text preprocessing * text preprocessing commit * fix metalearning * fix metalearning * adapted test to new text feature * fix style guide issues * integrate PR comments * integrate PR comments * implemented the comments to the last PR * fitted operation is not in place, therefore we have to assign the fitted self.preprocessor again to itself * add first text processing tests * add first text processing tests * including comments from 01.25. * including comments from 01.28. * including comments from 01.28. * including comments from 01.28. * including comments from 01.31.
- Loading branch information
There are no files selected for viewing
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,76 @@ | ||
from typing import Dict, Optional, Tuple, Union | ||
|
||
from ConfigSpace.configuration_space import ConfigurationSpace | ||
import ConfigSpace.hyperparameters as CSH | ||
|
||
import numpy as np | ||
|
||
from autosklearn.pipeline.base import DATASET_PROPERTIES_TYPE, PIPELINE_DATA_DTYPE | ||
from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm | ||
from autosklearn.pipeline.constants import DENSE, SPARSE, UNSIGNED_DATA, INPUT | ||
|
||
from sklearn.decomposition import TruncatedSVD | ||
|
||
|
||
class FeatureReduction(AutoSklearnPreprocessingAlgorithm):
    """Reduce the dimensionality of a bag-of-words text encoding via TruncatedSVD.

    Bag-of-words encodings are typically high-dimensional and sparse; this
    component projects them down to ``n_components`` latent dimensions.

    Parameters
    ----------
    n_components : Optional[int]
        Target output dimensionality. Normally supplied through the
        hyperparameter search space; must be set before ``fit`` is called.
    random_state : Optional[Union[int, np.random.RandomState]]
        Seed or generator forwarded to scikit-learn's ``TruncatedSVD``.
    """

    def __init__(
        self,
        n_components: Optional[int] = None,
        random_state: Optional[Union[int, np.random.RandomState]] = None
    ) -> None:
        self.n_components = n_components
        self.random_state = random_state
        # Initialized here so that transform() on an unfitted instance raises
        # the intended NotImplementedError instead of an AttributeError.
        self.preprocessor: Optional[TruncatedSVD] = None

    def fit(self, X: PIPELINE_DATA_DTYPE, y: Optional[PIPELINE_DATA_DTYPE] = None
            ) -> 'FeatureReduction':
        """Fit a TruncatedSVD on ``X``.

        TruncatedSVD requires ``n_components < n_features``, so when ``X`` has
        at most ``self.n_components`` features the effective number of
        components is capped at ``X.shape[1] - 1``.

        Raises
        ------
        ValueError
            If ``n_components`` was never set, or if ``X`` has at most one
            feature (nothing left to reduce).
        """
        if self.n_components is None:
            # Guard: comparing X.shape[1] against None below would otherwise
            # raise an opaque TypeError.
            raise ValueError("FeatureReduction requires 'n_components' to be "
                             "set before fit() is called.")
        if X.shape[1] > self.n_components:
            self.preprocessor = TruncatedSVD(n_components=self.n_components,
                                             random_state=self.random_state)
        elif X.shape[1] > 1:
            # Fewer features than requested components: cap at n_features - 1.
            self.preprocessor = TruncatedSVD(n_components=X.shape[1] - 1,
                                             random_state=self.random_state)
        else:
            # Zero or one feature: reduction is impossible.
            raise ValueError("The text embedding consists only of a single dimension.\n"
                             "Are you sure that your text data is necessary?")
        self.preprocessor.fit(X)
        return self

    def transform(self, X: PIPELINE_DATA_DTYPE) -> PIPELINE_DATA_DTYPE:
        """Project ``X`` onto the fitted SVD components."""
        if self.preprocessor is None:
            # auto-sklearn convention for "transform called before fit".
            raise NotImplementedError()
        return self.preprocessor.transform(X)

    @staticmethod
    def get_properties(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None
                       ) -> Dict[str, Optional[Union[str, int, bool, Tuple]]]:
        """Describe this component's capabilities to the pipeline machinery."""
        return {'shortname': 'TextFeatureReduction',
                'name': 'TextFeatureReduction',
                'handles_missing_values': True,
                'handles_nominal_values': True,
                'handles_numerical_features': True,
                'prefers_data_scaled': False,
                'prefers_data_normalized': False,
                'handles_regression': True,
                'handles_classification': True,
                'handles_multiclass': True,
                'handles_multilabel': True,
                'handles_multioutput': True,
                'is_deterministic': True,
                'handles_sparse': True,
                'handles_dense': True,
                'input': (DENSE, SPARSE, UNSIGNED_DATA),
                'output': (INPUT,),
                'preferred_dtype': None}

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None
                                        ) -> ConfigurationSpace:
        """Search space: a single log-scaled integer ``n_components`` in [1, 10000]."""
        cs = ConfigurationSpace()
        cs.add_hyperparameter(
            CSH.UniformIntegerHyperparameter("n_components", lower=1, upper=10000,
                                             default_value=100, log=True))
        return cs
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,119 @@ | ||
from typing import Any, List, Dict, Optional, Tuple, Union | ||
|
||
from ConfigSpace.configuration_space import Configuration, ConfigurationSpace | ||
|
||
import numpy as np | ||
|
||
from sklearn.base import BaseEstimator | ||
|
||
from autosklearn.pipeline.components.data_preprocessing.text_encoding \ | ||
import BagOfWordChoice | ||
from autosklearn.pipeline.components.data_preprocessing.feature_reduction.truncated_svd import \ | ||
FeatureReduction | ||
from autosklearn.pipeline.base import ( | ||
BasePipeline, | ||
DATASET_PROPERTIES_TYPE, | ||
) | ||
from autosklearn.pipeline.constants import DENSE, SPARSE, UNSIGNED_DATA, INPUT | ||
|
||
|
||
class TextPreprocessingPipeline(BasePipeline):
    """Data-preprocessing pipeline for inputs consisting solely of text features.

    Two steps are applied in order:

    1. ``text_encoding`` — fit a bag-of-words vectorizer and apply it.
    2. ``feature_reduction`` — shrink the encoded features with TruncatedSVD.

    Parameters
    ----------
    config : ConfigSpace.configuration_space.Configuration
        The configuration to evaluate.
    random_state : Optional[int | RandomState]
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance
        used by `np.random`.
    """

    def __init__(
        self,
        config: Optional[Configuration] = None,
        steps: Optional[List[Tuple[str, BaseEstimator]]] = None,
        dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None,
        include: Optional[Dict[str, str]] = None,
        exclude: Optional[Dict[str, str]] = None,
        random_state: Optional[Union[int, np.random.RandomState]] = None,
        init_params: Optional[Dict[str, Any]] = None
    ) -> None:
        # Record the dtype this pipeline emits before the base class wires up
        # the pipeline steps.
        self._output_dtype = np.int32
        super().__init__(
            config, steps, dataset_properties, include, exclude,
            random_state, init_params
        )

    @staticmethod
    def get_properties(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None
                       ) -> Dict[str, Optional[Union[str, int, bool, Tuple]]]:
        """Describe this pipeline's capabilities to the pipeline machinery."""
        return {'shortname': 'txt_datapreproc',
                'name': 'text data preprocessing',
                'handles_missing_values': True,
                'handles_nominal_values': False,
                'handles_numerical_features': False,
                'prefers_data_scaled': False,
                'prefers_data_normalized': False,
                'handles_regression': True,
                'handles_classification': True,
                'handles_multiclass': True,
                'handles_multilabel': True,
                'is_deterministic': True,
                'handles_sparse': True,
                'handles_dense': True,
                'input': (DENSE, SPARSE, UNSIGNED_DATA),
                'output': (INPUT,),
                'preferred_dtype': None}

    def _get_hyperparameter_search_space(
        self,
        include: Optional[Dict[str, str]] = None,
        exclude: Optional[Dict[str, str]] = None,
        dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None,
    ) -> ConfigurationSpace:
        """Assemble the configuration space from this pipeline's steps.

        Returns
        -------
        cs : ConfigSpace.configuration_space.ConfigurationSpace
            The configuration space covering every step of the pipeline.
        """
        # Normalize: treat None (or any non-dict) as an empty property dict.
        if not isinstance(dataset_properties, dict):
            dataset_properties = dict()

        return self._get_base_search_space(
            cs=ConfigurationSpace(),
            dataset_properties=dataset_properties,
            exclude=exclude,
            include=include,
            pipeline=self.steps,
        )

    def _get_pipeline_steps(self,
                            dataset_properties: Optional[Dict[str, str]] = None,
                            ) -> List[Tuple[str, BaseEstimator]]:
        """Build the ordered (name, component) pairs that make up this pipeline."""
        props: Dict[str, str] = {}
        if isinstance(dataset_properties, dict):
            props.update(dataset_properties)

        return [
            ("text_encoding",
             BagOfWordChoice(props, random_state=self.random_state)),
            ("feature_reduction",
             FeatureReduction(random_state=self.random_state)),
        ]

    def _get_estimator_hyperparameter_name(self) -> str:
        """Name under which this pipeline's hyperparameters are registered."""
        return "text data preprocessing"