Merge pull request #197 from automl/development
MAINT prepare release of version 0.1.1
mfeurer authored Nov 28, 2016
2 parents bc873f6 + bf9593d commit 5d1931a
Showing 27 changed files with 913 additions and 1,008 deletions.
60 changes: 37 additions & 23 deletions .travis.yml
@@ -2,49 +2,63 @@ language: python

sudo: false

os:
- linux
- osx

matrix:
allow_failures:
- os: osx

include:
- os: linux
env: DISTRIB="conda" PYTHON_VERSION="3.4" MINICONDA_URL="https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh"
- os: linux
env: DISTRIB="conda" PYTHON_VERSION="3.5" COVERAGE="true" MINICONDA_URL="https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh"

# Set language to generic to not break travis-ci
# https://github.com/travis-ci/travis-ci/issues/2312#issuecomment-195620855
# so far, this issue is still open and there is no good solution
# python will then be installed by anaconda
- os: osx
sudo: required
language: generic
env: DISTRIB="conda" PYTHON_VERSION="3.4" MINICONDA_URL="https://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh"
- os: osx
sudo: required
language: generic
env: DISTRIB="conda" PYTHON_VERSION="3.5" MINICONDA_URL="https://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh"

cache:
apt: true
# We use three different cache directories
# to work around a Travis bug with multi-platform cache
directories:
- $HOME/.cache/pip
- $HOME/download
pip: true

# command to install dependencies
addons:
apt:
sources:
- ubuntu-toolchain-r-test
packages:
- gcc-4.8
- g++-4.8
- libatlas-dev
- liblapack-dev
- libatlas-base-dev
- gfortran
git:
depth: 5

env:
global:
# Directory where tests are run from
- TEST_DIR=/tmp/test_dir/
- MODULE=autosklearn
matrix:
- DISTRIB="conda" PYTHON_VERSION="3.4"
- DISTRIB="conda" PYTHON_VERSION="3.5" COVERAGE="true"

before_install:
- wget $MINICONDA_URL -O miniconda.sh
- bash miniconda.sh -b -p $HOME/miniconda
- export PATH="$HOME/miniconda/bin:$PATH"
- if [[ `which conda` ]]; then echo 'Conda installation successful'; else exit 1; fi
- conda update --yes conda
- conda create -n testenv --yes python=$PYTHON_VERSION pip wheel nose
- source activate testenv
- conda install --yes gcc
- echo "Using GCC at "`which gcc`
- export CC=`which gcc`

install:
# Necessary for random forest
- export CXX="g++-4.8" CC="gcc-4.8"
- source ci_scripts/install.sh
- pip install coverage pep8 python-coveralls
- cat requirements.txt | xargs -n 1 -L 1 pip install
- python setup.py install

script: bash ci_scripts/test.sh
after_success: source ci_scripts/success.sh

14 changes: 13 additions & 1 deletion autosklearn/__init__.py
@@ -1,2 +1,14 @@
# -*- encoding: utf-8 -*-
__version__ = '0.1.0'
from autosklearn.util import dependencies

__version__ = '0.1.1'

__MANDATORY_PACKAGES__ = '''
scikit-learn==0.17.1
smac==0.2.1
lockfile>=0.10
ConfigSpace>=0.2.1
pyrfr==0.2.0
'''

dependencies.verify_packages(__MANDATORY_PACKAGES__)
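
The new __init__.py pins the mandatory package versions and checks them at import time via dependencies.verify_packages. That helper is not shown in this diff; the following is a minimal sketch of how such a check could work, assuming one name/operator/version requirement per line — the actual autosklearn.util.dependencies implementation may differ.

# Hypothetical sketch of a verify_packages-style check; names and
# behaviour here are assumptions, not the code shipped in 0.1.1.
import re
import pkg_resources

_REQUIREMENT = re.compile(
    r'^(?P<name>[A-Za-z0-9_\-]+)(?P<operation>==|>=)(?P<version>[A-Za-z0-9.\-]+)$')

def verify_packages(specification):
    for line in specification.splitlines():
        line = line.strip()
        if not line:
            continue
        match = _REQUIREMENT.match(line)
        if match is None:
            raise ValueError('Cannot parse requirement %r' % line)
        name, operation, required = match.group('name', 'operation', 'version')
        # get_distribution raises DistributionNotFound if the package is missing.
        installed = pkg_resources.get_distribution(name).version
        installed_version = pkg_resources.parse_version(installed)
        required_version = pkg_resources.parse_version(required)
        if operation == '==':
            satisfied = installed_version == required_version
        else:
            satisfied = installed_version >= required_version
        if not satisfied:
            raise ImportError('Package %s%s%s required, but version %s is installed.'
                              % (name, operation, required, installed))
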
128 changes: 68 additions & 60 deletions autosklearn/automl.py
@@ -19,12 +19,13 @@
from autosklearn.data.data_manager_factory import get_data_manager
from autosklearn.data.competition_data_manager import CompetitionDataManager
from autosklearn.data.xy_data_manager import XYDataManager
from autosklearn.evaluation import resampling, eval_with_limits
from autosklearn.evaluation import resampling, ExecuteTaFuncWithQueue
from autosklearn.evaluation import calculate_score
from autosklearn.util import StopWatch, get_logger, setup_logger, \
pipeline
from autosklearn.ensemble_builder import EnsembleBuilder
from autosklearn.smbo import AutoMLSMBO
from autosklearn.util.hash import hash_numpy_array


class AutoML(BaseEstimator):
@@ -71,7 +72,8 @@ def __init__(self,
self._include_estimators = include_estimators
self._include_preprocessors = include_preprocessors
self._resampling_strategy = resampling_strategy
self._resampling_strategy_arguments = resampling_strategy_arguments
self._resampling_strategy_arguments = resampling_strategy_arguments \
if resampling_strategy_arguments is not None else {}
self._max_iter_smac = max_iter_smac
#self.delete_tmp_folder_after_terminate = \
# delete_tmp_folder_after_terminate
@@ -147,9 +149,7 @@ def fit(self, X, y,
self._backend.context.create_directories()

if dataset_name is None:
m = hashlib.md5()
m.update(X.data)
dataset_name = m.hexdigest()
dataset_name = hash_numpy_array(X)

self._backend.save_start_time(self._seed)
self._stopwatch = StopWatch()
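
fit now derives the default dataset name via hash_numpy_array from the new autosklearn.util.hash module instead of hashing X.data inline. That module is not shown in this diff; a plausible sketch, assuming it simply hashes the array's raw buffer, is:

# Illustrative only -- the actual autosklearn.util.hash implementation
# may treat sparse matrices and dtypes differently.
import hashlib
import numpy as np
from scipy import sparse

def hash_numpy_array(X):
    m = hashlib.md5()
    if sparse.issparse(X):
        # For sparse matrices, hash the underlying data buffer.
        m.update(np.ascontiguousarray(X.data))
    else:
        m.update(np.ascontiguousarray(X))
    return m.hexdigest()
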
@@ -232,37 +232,32 @@ def _print_load_time(basename, time_left_for_this_task,
def _do_dummy_prediction(self, datamanager, num_run):

self._logger.info("Starting to create dummy predictions.")
time_limit = int(self._time_for_task / 6.)
# time_limit = int(self._time_for_task / 6.)
memory_limit = int(self._ml_memory_limit)

_info = eval_with_limits(datamanager, self._backend, 1,
self._seed, num_run,
self._resampling_strategy,
self._resampling_strategy_arguments,
memory_limit, time_limit,
logger=self._logger)
if _info[4] == StatusType.SUCCESS:
self._logger.info("Finished creating dummy prediction 1/2.")
else:
self._logger.error('Error creating dummy prediction 1/2:%s ',
_info[3])

num_run += 1

_info = eval_with_limits(datamanager, self._backend, 2,
self._seed, num_run,
self._resampling_strategy,
self._resampling_strategy_arguments,
memory_limit, time_limit,
logger=self._logger)
if _info[4] == StatusType.SUCCESS:
self._logger.info("Finished creating dummy prediction 2/2.")
ta = ExecuteTaFuncWithQueue(backend=self._backend,
autosklearn_seed=self._seed,
resampling_strategy=self._resampling_strategy,
initial_num_run=num_run,
logger=self._logger,
**self._resampling_strategy_arguments)

status, cost, runtime, additional_info = \
ta.run(1, cutoff=self._time_for_task, memory_limit=memory_limit)
if status == StatusType.SUCCESS:
self._logger.info("Finished creating dummy predictions.")
else:
self._logger.error('Error creating dummy prediction 2/2 %s',
_info[3])
self._logger.error('Error creating dummy predictions:%s ',
additional_info)

num_run += 1
return num_run
#status, cost, runtime, additional_info = \
# ta.run(2, cutoff=time_limit, memory_limit=memory_limit)
#if status == StatusType.SUCCESS:
# self._logger.info("Finished creating dummy prediction 2/2.")
#else:
# self._logger.error('Error creating dummy prediction 2/2 %s',
# additional_info)

return ta.num_run

def _fit(self, datamanager):
# Reset learnt stuff
@@ -374,7 +369,7 @@ def _fit(self, datamanager):
if time_left_for_smac <= 0:
self._logger.warning("Not starting SMAC because there is no time "
"left.")
self._proc_smac = None
_proc_smac = None
else:
if self._per_run_time_limit is None or \
self._per_run_time_limit > time_left_for_smac:
@@ -385,25 +380,25 @@
else:
per_run_time_limit = self._per_run_time_limit

self._proc_smac = AutoMLSMBO(config_space=self.configuration_space,
dataset_name=self._dataset_name,
backend=self._backend,
total_walltime_limit=time_left_for_smac,
func_eval_time_limit=per_run_time_limit,
memory_limit=self._ml_memory_limit,
data_memory_limit=self._data_memory_limit,
watcher=self._stopwatch,
start_num_run=num_run,
num_metalearning_cfgs=self._initial_configurations_via_metalearning,
config_file=configspace_path,
smac_iters=self._max_iter_smac,
seed=self._seed,
metadata_directory=self._metadata_directory,
resampling_strategy=self._resampling_strategy,
resampling_strategy_args=self._resampling_strategy_arguments,
acquisition_function=self.acquisition_function,
shared_mode=self._shared_mode)
self._proc_smac.run_smbo()
_proc_smac = AutoMLSMBO(config_space=self.configuration_space,
dataset_name=self._dataset_name,
backend=self._backend,
total_walltime_limit=time_left_for_smac,
func_eval_time_limit=per_run_time_limit,
memory_limit=self._ml_memory_limit,
data_memory_limit=self._data_memory_limit,
watcher=self._stopwatch,
start_num_run=num_run,
num_metalearning_cfgs=self._initial_configurations_via_metalearning,
config_file=configspace_path,
smac_iters=self._max_iter_smac,
seed=self._seed,
metadata_directory=self._metadata_directory,
resampling_strategy=self._resampling_strategy,
resampling_strategy_args=self._resampling_strategy_arguments,
acquisition_function=self.acquisition_function,
shared_mode=self._shared_mode)
self.runhistory_ = _proc_smac.run_smbo()

self._proc_ensemble = None
self._load_models()
Expand All @@ -418,12 +413,25 @@ def refit(self, X, y):
self.ensemble_ is None:
self._load_models()

random_state = np.random.RandomState(self._seed)
for identifier in self.models_:
if identifier in self.ensemble_.get_model_identifiers():
model = self.models_[identifier]
# this updates the model inplace, it can then later be used in
# predict method
model.fit(X.copy(), y.copy())

# try to fit the model. If it fails, shuffle the data. This
# could alleviate the problem in algorithms that depend on
# the ordering of the data.
for i in range(10):
try:
model.fit(X.copy(), y.copy())
break
except ValueError:
indices = list(range(X.shape[0]))
random_state.shuffle(indices)
X = X[indices]
y = y[indices]

self._can_predict = True
return self
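
The refit change above retries a failed fit up to ten times, reshuffling the rows between attempts so that estimators sensitive to the ordering of the data get another chance. A standalone sketch of the same idea (the helper name and the final raise are illustrative, not part of this commit):

# Sketch of a retry-with-shuffle loop; not the exact code added above.
import numpy as np

def fit_with_shuffle_retries(model, X, y, seed=1, max_retries=10):
    random_state = np.random.RandomState(seed)
    for _ in range(max_retries):
        try:
            model.fit(X.copy(), y.copy())
            return model
        except ValueError:
            # Reorder the rows and try again.
            indices = np.arange(X.shape[0])
            random_state.shuffle(indices)
            X, y = X[indices], y[indices]
    raise ValueError('Model could not be fitted after %d shuffled attempts.'
                     % max_retries)
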
@@ -561,8 +569,8 @@ def grid_scores_(self):
scores_per_config = defaultdict(list)
config_list = list()

for run_key in self._proc_smac.runhistory.data:
run_value = self._proc_smac.runhistory.data[run_key]
for run_key in self.runhistory_.data:
run_value = self.runhistory_.data[run_key]

config_id = run_key.config_id
cost = run_value.cost
Expand All @@ -575,7 +583,7 @@ def grid_scores_(self):
for config_id in config_list:
scores = [1 - score for score in scores_per_config[config_id]]
mean_score = np.mean(scores)
config = self._proc_smac.runhistory.ids_config[config_id]
config = self.runhistory_.ids_config[config_id]

grid_score = _CVScoreTuple(config.get_dictionary(), mean_score,
scores)
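
grid_scores_ (and cv_results_ below) now read from self.runhistory_, the run history returned by run_smbo, instead of reaching into the SMBO object. A minimal sketch of the aggregation pattern, assuming a SMAC-style run history whose entries expose config_id and cost (the helper name is hypothetical):

# Illustrative aggregation over a SMAC-style runhistory; the actual
# grid_scores_ property does more bookkeeping than this.
from collections import defaultdict
import numpy as np

def mean_score_per_config(runhistory):
    scores_per_config = defaultdict(list)
    for run_key, run_value in runhistory.data.items():
        # auto-sklearn stores cost = 1 - score, so invert it back.
        scores_per_config[run_key.config_id].append(1.0 - run_value.cost)
    return {config_id: float(np.mean(scores))
            for config_id, scores in scores_per_config.items()}
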
@@ -616,10 +624,10 @@ def cv_results_(self):
mean_fit_time = []
params = []
status = []
for run_key in self._proc_smac.runhistory.data:
run_value = self._proc_smac.runhistory.data[run_key]
for run_key in self.runhistory_.data:
run_value = self.runhistory_.data[run_key]
config_id = run_key.config_id
config = self._proc_smac.runhistory.ids_config[config_id]
config = self.runhistory_.ids_config[config_id]

param_dict = config.get_dictionary()
params.append(param_dict)
2 changes: 1 addition & 1 deletion autosklearn/ensemble_builder.py
@@ -129,7 +129,7 @@ def main(self):
if dir_ensemble_file.endswith("/"):
dir_ensemble_file = dir_ensemble_file[:-1]
if not dir_ensemble_file.endswith(".npy"):
self.logger.warning('Error loading file (not .npy): %s', dir_ensemble_file)
self.logger.info('Error loading file (not .npy): %s', dir_ensemble_file)
continue

dir_ensemble_model_files.append(dir_ensemble_file)