Merge pull request #197 from automl/development
MAINT prepare release of version 0.1.1
mfeurer authored Nov 28, 2016
2 parents bc873f6 + bf9593d commit 5d1931a
Showing 27 changed files with 913 additions and 1,008 deletions.
60 changes: 37 additions & 23 deletions .travis.yml
@@ -2,49 +2,63 @@ language: python

sudo: false

os:
- linux
- osx

matrix:
allow_failures:
- os: osx

include:
- os: linux
env: DISTRIB="conda" PYTHON_VERSION="3.4" MINICONDA_URL="https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh"
- os: linux
env: DISTRIB="conda" PYTHON_VERSION="3.5" COVERAGE="true" MINICONDA_URL="https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh"

# Set language to generic to not break travis-ci
# https://github.com/travis-ci/travis-ci/issues/2312#issuecomment-195620855
# so far, this issue is still open and there is no good solution
# python will then be installed by anaconda
- os: osx
sudo: required
language: generic
env: DISTRIB="conda" PYTHON_VERSION="3.4" MINICONDA_URL="https://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh"
- os: osx
sudo: required
language: generic
env: DISTRIB="conda" PYTHON_VERSION="3.5" MINICONDA_URL="https://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh"

cache:
apt: true
# We use three different cache directories
# to work around a Travis bug with multi-platform cache
directories:
- $HOME/.cache/pip
- $HOME/download
pip: true

# command to install dependencies
addons:
apt:
sources:
- ubuntu-toolchain-r-test
packages:
- gcc-4.8
- g++-4.8
- libatlas-dev
- liblapack-dev
- libatlas-base-dev
- gfortran
git:
depth: 5

env:
global:
# Directory where tests are run from
- TEST_DIR=/tmp/test_dir/
- MODULE=autosklearn
matrix:
- DISTRIB="conda" PYTHON_VERSION="3.4"
- DISTRIB="conda" PYTHON_VERSION="3.5" COVERAGE="true"

before_install:
- wget $MINICONDA_URL -O miniconda.sh
- bash miniconda.sh -b -p $HOME/miniconda
- export PATH="$HOME/miniconda/bin:$PATH"
- if [[ `which conda` ]]; then echo 'Conda installation successful'; else exit 1; fi
- conda update --yes conda
- conda create -n testenv --yes python=$PYTHON_VERSION pip wheel nose
- source activate testenv
- conda install --yes gcc
- echo "Using GCC at "`which gcc`
- export CC=`which gcc`

install:
# Necessary for random forest
- export CXX="g++-4.8" CC="gcc-4.8"
- source ci_scripts/install.sh
- pip install coverage pep8 python-coveralls
- cat requirements.txt | xargs -n 1 -L 1 pip install
- python setup.py install

script: bash ci_scripts/test.sh
after_success: source ci_scripts/success.sh

14 changes: 13 additions & 1 deletion autosklearn/__init__.py
@@ -1,2 +1,14 @@
# -*- encoding: utf-8 -*-
__version__ = '0.1.0'
from autosklearn.util import dependencies

__version__ = '0.1.1'

__MANDATORY_PACKAGES__ = '''
scikit-learn==0.17.1
smac==0.2.1
lockfile>=0.10
ConfigSpace>=0.2.1
pyrfr==0.2.0
'''

dependencies.verify_packages(__MANDATORY_PACKAGES__)
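
The new __init__.py pins the mandatory package versions and checks them at import time via dependencies.verify_packages. That helper is not shown in this diff; the following is a minimal sketch of how such a check could work, assuming one name/operator/version requirement per line — the actual autosklearn.util.dependencies implementation may differ.

# Hypothetical sketch of a verify_packages-style check; names and
# behaviour here are assumptions, not the code shipped in 0.1.1.
import re
import pkg_resources

_REQUIREMENT = re.compile(
    r'^(?P<name>[A-Za-z0-9_\-]+)(?P<operation>==|>=)(?P<version>[A-Za-z0-9.\-]+)$')

def verify_packages(specification):
    for line in specification.splitlines():
        line = line.strip()
        if not line:
            continue
        match = _REQUIREMENT.match(line)
        if match is None:
            raise ValueError('Cannot parse requirement %r' % line)
        name, operation, required = match.group('name', 'operation', 'version')
        # get_distribution raises DistributionNotFound if the package is missing.
        installed = pkg_resources.get_distribution(name).version
        installed_version = pkg_resources.parse_version(installed)
        required_version = pkg_resources.parse_version(required)
        if operation == '==':
            satisfied = installed_version == required_version
        else:
            satisfied = installed_version >= required_version
        if not satisfied:
            raise ImportError('Package %s%s%s required, but version %s is installed.'
                              % (name, operation, required, installed))
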
128 changes: 68 additions & 60 deletions autosklearn/automl.py
@@ -19,12 +19,13 @@
from autosklearn.data.data_manager_factory import get_data_manager
from autosklearn.data.competition_data_manager import CompetitionDataManager
from autosklearn.data.xy_data_manager import XYDataManager
from autosklearn.evaluation import resampling, eval_with_limits
from autosklearn.evaluation import resampling, ExecuteTaFuncWithQueue
from autosklearn.evaluation import calculate_score
from autosklearn.util import StopWatch, get_logger, setup_logger, \
pipeline
from autosklearn.ensemble_builder import EnsembleBuilder
from autosklearn.smbo import AutoMLSMBO
from autosklearn.util.hash import hash_numpy_array


class AutoML(BaseEstimator):
@@ -71,7 +72,8 @@ def __init__(self,
self._include_estimators = include_estimators
self._include_preprocessors = include_preprocessors
self._resampling_strategy = resampling_strategy
self._resampling_strategy_arguments = resampling_strategy_arguments
self._resampling_strategy_arguments = resampling_strategy_arguments \
if resampling_strategy_arguments is not None else {}
self._max_iter_smac = max_iter_smac
#self.delete_tmp_folder_after_terminate = \
# delete_tmp_folder_after_terminate
@@ -147,9 +149,7 @@ def fit(self, X, y,
self._backend.context.create_directories()

if dataset_name is None:
m = hashlib.md5()
m.update(X.data)
dataset_name = m.hexdigest()
dataset_name = hash_numpy_array(X)

self._backend.save_start_time(self._seed)
self._stopwatch = StopWatch()
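
fit now derives the default dataset name via hash_numpy_array from the new autosklearn.util.hash module instead of hashing X.data inline. That module is not shown in this diff; a plausible sketch, assuming it simply hashes the array's raw buffer, is:

# Illustrative only -- the actual autosklearn.util.hash implementation
# may treat sparse matrices and dtypes differently.
import hashlib
import numpy as np
from scipy import sparse

def hash_numpy_array(X):
    m = hashlib.md5()
    if sparse.issparse(X):
        # For sparse matrices, hash the underlying data buffer.
        m.update(np.ascontiguousarray(X.data))
    else:
        m.update(np.ascontiguousarray(X))
    return m.hexdigest()
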
@@ -232,37 +232,32 @@ def _print_load_time(basename, time_left_for_this_task,
def _do_dummy_prediction(self, datamanager, num_run):

self._logger.info("Starting to create dummy predictions.")
time_limit = int(self._time_for_task / 6.)
# time_limit = int(self._time_for_task / 6.)
memory_limit = int(self._ml_memory_limit)

_info = eval_with_limits(datamanager, self._backend, 1,
self._seed, num_run,
self._resampling_strategy,
self._resampling_strategy_arguments,
memory_limit, time_limit,
logger=self._logger)
if _info[4] == StatusType.SUCCESS:
self._logger.info("Finished creating dummy prediction 1/2.")
else:
self._logger.error('Error creating dummy prediction 1/2:%s ',
_info[3])

num_run += 1

_info = eval_with_limits(datamanager, self._backend, 2,
self._seed, num_run,
self._resampling_strategy,
self._resampling_strategy_arguments,
memory_limit, time_limit,
logger=self._logger)
if _info[4] == StatusType.SUCCESS:
self._logger.info("Finished creating dummy prediction 2/2.")
ta = ExecuteTaFuncWithQueue(backend=self._backend,
autosklearn_seed=self._seed,
resampling_strategy=self._resampling_strategy,
initial_num_run=num_run,
logger=self._logger,
**self._resampling_strategy_arguments)

status, cost, runtime, additional_info = \
ta.run(1, cutoff=self._time_for_task, memory_limit=memory_limit)
if status == StatusType.SUCCESS:
self._logger.info("Finished creating dummy predictions.")
else:
self._logger.error('Error creating dummy prediction 2/2 %s',
_info[3])
self._logger.error('Error creating dummy predictions:%s ',
additional_info)

num_run += 1
return num_run
#status, cost, runtime, additional_info = \
# ta.run(2, cutoff=time_limit, memory_limit=memory_limit)
#if status == StatusType.SUCCESS:
# self._logger.info("Finished creating dummy prediction 2/2.")
#else:
# self._logger.error('Error creating dummy prediction 2/2 %s',
# additional_info)

return ta.num_run

def _fit(self, datamanager):
# Reset learnt stuff
@@ -374,7 +369,7 @@ def _fit(self, datamanager):
if time_left_for_smac <= 0:
self._logger.warning("Not starting SMAC because there is no time "
"left.")
self._proc_smac = None
_proc_smac = None
else:
if self._per_run_time_limit is None or \
self._per_run_time_limit > time_left_for_smac:
@@ -385,25 +380,25 @@
else:
per_run_time_limit = self._per_run_time_limit

self._proc_smac = AutoMLSMBO(config_space=self.configuration_space,
dataset_name=self._dataset_name,
backend=self._backend,
total_walltime_limit=time_left_for_smac,
func_eval_time_limit=per_run_time_limit,
memory_limit=self._ml_memory_limit,
data_memory_limit=self._data_memory_limit,
watcher=self._stopwatch,
start_num_run=num_run,
num_metalearning_cfgs=self._initial_configurations_via_metalearning,
config_file=configspace_path,
smac_iters=self._max_iter_smac,
seed=self._seed,
metadata_directory=self._metadata_directory,
resampling_strategy=self._resampling_strategy,
resampling_strategy_args=self._resampling_strategy_arguments,
acquisition_function=self.acquisition_function,
shared_mode=self._shared_mode)
self._proc_smac.run_smbo()
_proc_smac = AutoMLSMBO(config_space=self.configuration_space,
dataset_name=self._dataset_name,
backend=self._backend,
total_walltime_limit=time_left_for_smac,
func_eval_time_limit=per_run_time_limit,
memory_limit=self._ml_memory_limit,
data_memory_limit=self._data_memory_limit,
watcher=self._stopwatch,
start_num_run=num_run,
num_metalearning_cfgs=self._initial_configurations_via_metalearning,
config_file=configspace_path,
smac_iters=self._max_iter_smac,
seed=self._seed,
metadata_directory=self._metadata_directory,
resampling_strategy=self._resampling_strategy,
resampling_strategy_args=self._resampling_strategy_arguments,
acquisition_function=self.acquisition_function,
shared_mode=self._shared_mode)
self.runhistory_ = _proc_smac.run_smbo()

self._proc_ensemble = None
self._load_models()
Expand All @@ -418,12 +413,25 @@ def refit(self, X, y):
self.ensemble_ is None:
self._load_models()

random_state = np.random.RandomState(self._seed)
for identifier in self.models_:
if identifier in self.ensemble_.get_model_identifiers():
model = self.models_[identifier]
# this updates the model inplace, it can then later be used in
# predict method
model.fit(X.copy(), y.copy())

# try to fit the model. If it fails, shuffle the data. This
# could alleviate the problem in algorithms that depend on
# the ordering of the data.
for i in range(10):
try:
model.fit(X.copy(), y.copy())
break
except ValueError:
indices = list(range(X.shape[0]))
random_state.shuffle(indices)
X = X[indices]
y = y[indices]

self._can_predict = True
return self
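
The refit change above retries a failed fit up to ten times, reshuffling the rows between attempts so that estimators sensitive to the ordering of the data get another chance. A standalone sketch of the same idea (the helper name and the final raise are illustrative, not part of this commit):

# Sketch of a retry-with-shuffle loop; not the exact code added above.
import numpy as np

def fit_with_shuffle_retries(model, X, y, seed=1, max_retries=10):
    random_state = np.random.RandomState(seed)
    for _ in range(max_retries):
        try:
            model.fit(X.copy(), y.copy())
            return model
        except ValueError:
            # Reorder the rows and try again.
            indices = np.arange(X.shape[0])
            random_state.shuffle(indices)
            X, y = X[indices], y[indices]
    raise ValueError('Model could not be fitted after %d shuffled attempts.'
                     % max_retries)
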
@@ -561,8 +569,8 @@ def grid_scores_(self):
scores_per_config = defaultdict(list)
config_list = list()

for run_key in self._proc_smac.runhistory.data:
run_value = self._proc_smac.runhistory.data[run_key]
for run_key in self.runhistory_.data:
run_value = self.runhistory_.data[run_key]

config_id = run_key.config_id
cost = run_value.cost
Expand All @@ -575,7 +583,7 @@ def grid_scores_(self):
for config_id in config_list:
scores = [1 - score for score in scores_per_config[config_id]]
mean_score = np.mean(scores)
config = self._proc_smac.runhistory.ids_config[config_id]
config = self.runhistory_.ids_config[config_id]

grid_score = _CVScoreTuple(config.get_dictionary(), mean_score,
scores)
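
grid_scores_ (and cv_results_ below) now read from self.runhistory_, the run history returned by run_smbo, instead of reaching into the SMBO object. A minimal sketch of the aggregation pattern, assuming a SMAC-style run history whose entries expose config_id and cost (the helper name is hypothetical):

# Illustrative aggregation over a SMAC-style runhistory; the actual
# grid_scores_ property does more bookkeeping than this.
from collections import defaultdict
import numpy as np

def mean_score_per_config(runhistory):
    scores_per_config = defaultdict(list)
    for run_key, run_value in runhistory.data.items():
        # auto-sklearn stores cost = 1 - score, so invert it back.
        scores_per_config[run_key.config_id].append(1.0 - run_value.cost)
    return {config_id: float(np.mean(scores))
            for config_id, scores in scores_per_config.items()}
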
@@ -616,10 +624,10 @@ def cv_results_(self):
mean_fit_time = []
params = []
status = []
for run_key in self._proc_smac.runhistory.data:
run_value = self._proc_smac.runhistory.data[run_key]
for run_key in self.runhistory_.data:
run_value = self.runhistory_.data[run_key]
config_id = run_key.config_id
config = self._proc_smac.runhistory.ids_config[config_id]
config = self.runhistory_.ids_config[config_id]

param_dict = config.get_dictionary()
params.append(param_dict)
2 changes: 1 addition & 1 deletion autosklearn/ensemble_builder.py
@@ -129,7 +129,7 @@ def main(self):
if dir_ensemble_file.endswith("/"):
dir_ensemble_file = dir_ensemble_file[:-1]
if not dir_ensemble_file.endswith(".npy"):
self.logger.warning('Error loading file (not .npy): %s', dir_ensemble_file)
self.logger.info('Error loading file (not .npy): %s', dir_ensemble_file)
continue

dir_ensemble_model_files.append(dir_ensemble_file)