From a0a5dcc1abe25503e143fac932627e3ac92870de Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Tue, 18 Oct 2016 09:26:01 +0200 Subject: [PATCH 01/38] FIX run automl object tests on travis --- test/test_automl/test_estimators.py | 3 --- test/test_automl/test_pickle.py | 3 --- test/test_automl/test_start_automl.py | 6 ------ 3 files changed, 12 deletions(-) diff --git a/test/test_automl/test_estimators.py b/test/test_automl/test_estimators.py index 0e32504b78..81e73658fa 100644 --- a/test/test_automl/test_estimators.py +++ b/test/test_automl/test_estimators.py @@ -37,9 +37,6 @@ class EstimatorTest(Base, unittest.TestCase): _multiprocess_can_split_ = True def test_fit(self): - if self.travis: - self.skipTest('This test does currently not run on travis-ci. ' - 'Make sure it runs locally on your machine!') output = os.path.join(self.test_dir, '..', '.tmp_estimator_fit') self._setUp(output) diff --git a/test/test_automl/test_pickle.py b/test/test_automl/test_pickle.py index f6fe2041b4..0206a0d8cf 100644 --- a/test/test_automl/test_pickle.py +++ b/test/test_automl/test_pickle.py @@ -13,9 +13,6 @@ class PicklingTests(Base, unittest.TestCase): def test_can_pickle_classifier(self): - if self.travis: - self.skipTest('This test does currently not run on travis-ci. ' - 'Make sure it runs locally on your machine!') output = os.path.join(self.test_dir, '..', '.tmp_can_pickle') self._setUp(output) diff --git a/test/test_automl/test_start_automl.py b/test/test_automl/test_start_automl.py index c7b962609d..0221e39b67 100644 --- a/test/test_automl/test_start_automl.py +++ b/test/test_automl/test_start_automl.py @@ -24,9 +24,6 @@ class AutoMLTest(Base, unittest.TestCase): _multiprocess_can_split_ = True def test_fit(self): - if self.travis: - self.skipTest('This test does currently not run on travis-ci. ' - 'Make sure it runs locally on your machine!') output = os.path.join(self.test_dir, '..', '.tmp_test_fit') self._setUp(output) @@ -47,9 +44,6 @@ def test_binary_score(self): Test fix for binary classification prediction taking the index 1 of second dimension in prediction matrix """ - if self.travis: - self.skipTest('This test does currently not run on travis-ci. 
' - 'Make sure it runs locally on your machine!') output = os.path.join(self.test_dir, '..', '.tmp_test_binary_score') self._setUp(output) From cc44d0398edd1c7843b5cba3cf3fafd344e41984 Mon Sep 17 00:00:00 2001 From: Anatolii Domashnev Date: Tue, 18 Oct 2016 14:44:29 +0200 Subject: [PATCH 02/38] Use info instead of warning for casual files in models dirs --- autosklearn/ensemble_builder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autosklearn/ensemble_builder.py b/autosklearn/ensemble_builder.py index 25e3554f01..f72546a538 100644 --- a/autosklearn/ensemble_builder.py +++ b/autosklearn/ensemble_builder.py @@ -129,7 +129,7 @@ def main(self): if dir_ensemble_file.endswith("/"): dir_ensemble_file = dir_ensemble_file[:-1] if not dir_ensemble_file.endswith(".npy"): - self.logger.warning('Error loading file (not .npy): %s', dir_ensemble_file) + self.logger.info('Error loading file (not .npy): %s', dir_ensemble_file) continue dir_ensemble_model_files.append(dir_ensemble_file) From 1cbf0447e680c3dd71f00d69778e96671d01da85 Mon Sep 17 00:00:00 2001 From: Timothy J Laurent Date: Wed, 19 Oct 2016 12:30:33 -0700 Subject: [PATCH 03/38] Change `cat` to `curl` in installation directions `cat` isn't the appropriate command for this step, `curl` works great though --- doc/index.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/index.rst b/doc/index.rst index 9745252457..592af974cc 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -56,7 +56,7 @@ Please install all dependencies manually with: .. code:: bash - cat https://raw.githubusercontent.com/automl/auto-sklearn/master/requirements.txt | xargs -n 1 -L 1 pip install + curl https://raw.githubusercontent.com/automl/auto-sklearn/master/requirements.txt | xargs -n 1 -L 1 pip install Then install *auto-sklearn* From bf4c30f6debe20f565f8723eadde4a14f37f35c4 Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Mon, 24 Oct 2016 09:58:07 +0200 Subject: [PATCH 04/38] CI use anaconda gcc compiler instead of ubuntu/debian one --- .travis.yml | 6 +++--- ci_scripts/install.sh | 4 ++++ 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index 17bbdfccd3..ea79bc9310 100644 --- a/.travis.yml +++ b/.travis.yml @@ -25,8 +25,8 @@ addons: sources: - ubuntu-toolchain-r-test packages: - - gcc-4.8 - - g++-4.8 + #- gcc-4.8 + #- g++-4.8 - libatlas-dev - liblapack-dev - libatlas-base-dev @@ -43,7 +43,7 @@ env: install: # Necessary for random forest - - export CXX="g++-4.8" CC="gcc-4.8" + #- export CXX="g++-4.8" CC="gcc-4.8" - source ci_scripts/install.sh script: bash ci_scripts/test.sh after_success: source ci_scripts/success.sh diff --git a/ci_scripts/install.sh b/ci_scripts/install.sh index ed1e959bfd..86473fdc28 100644 --- a/ci_scripts/install.sh +++ b/ci_scripts/install.sh @@ -27,6 +27,10 @@ popd conda create -n testenv --yes python=$PYTHON_VERSION pip nose source activate testenv +# Install anaconda gcc compiler to have compiler compatible with the +# anaconda python executable +conda install gcc + # Install requirements in correct order cat requirements.txt | xargs -n 1 -L 1 pip install From 4d5bff3a75fecd02d0a778e095200d3440206568 Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Mon, 24 Oct 2016 10:10:32 +0200 Subject: [PATCH 05/38] CI stop conda from asking for confirmation --- ci_scripts/install.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci_scripts/install.sh b/ci_scripts/install.sh index 86473fdc28..8a76a0914c 100644 --- a/ci_scripts/install.sh +++ 
b/ci_scripts/install.sh @@ -29,7 +29,7 @@ source activate testenv # Install anaconda gcc compiler to have compiler compatible with the # anaconda python executable -conda install gcc +conda install gcc --yes # Install requirements in correct order cat requirements.txt | xargs -n 1 -L 1 pip install From 3cf134cae8151acf9df4d2fe6ff8a6ee6950916f Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Mon, 24 Oct 2016 10:49:06 +0200 Subject: [PATCH 06/38] CI completely disable use of apt-get --- .travis.yml | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/.travis.yml b/.travis.yml index ea79bc9310..0cc5cb9334 100644 --- a/.travis.yml +++ b/.travis.yml @@ -11,7 +11,7 @@ matrix: - os: osx cache: - apt: true + #apt: true # We use three different cache directory # to work around a Travis bug with multi-platform cache directories: @@ -20,17 +20,17 @@ cache: pip: true # command to install dependencies -addons: - apt: - sources: - - ubuntu-toolchain-r-test - packages: - #- gcc-4.8 - #- g++-4.8 - - libatlas-dev - - liblapack-dev - - libatlas-base-dev - - gfortran +#addons: +# apt: +# sources: +# - ubuntu-toolchain-r-test +# packages: +# - gcc-4.8 +# - g++-4.8 +# - libatlas-dev +# - liblapack-dev +# - libatlas-base-dev +# - gfortran env: global: From 2a76e2ea9b8af556c367b9cd40017bb9ea9b65db Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Mon, 24 Oct 2016 11:24:15 +0200 Subject: [PATCH 07/38] CI set language to generic for OSX --- .travis.yml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/.travis.yml b/.travis.yml index 0cc5cb9334..cfd595168f 100644 --- a/.travis.yml +++ b/.travis.yml @@ -9,6 +9,18 @@ os: matrix: allow_failures: - os: osx + + include: + - os: linux + python: 3.5 + + # Set language to generic to not break travis-ci + # https://github.com/travis-ci/travis-ci/issues/2312#issuecomment-195620855 + # so far, this issue is still open and there is no good solution + # python will then be installed by anaconda + - os: osx + sudo: required + language: generic cache: #apt: true From 47e70200417b2baae0b739b250d6be2dc9505a66 Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Mon, 24 Oct 2016 12:59:03 +0200 Subject: [PATCH 08/38] CI specify full build matrix --- .travis.yml | 25 +++++++------------------ 1 file changed, 7 insertions(+), 18 deletions(-) diff --git a/.travis.yml b/.travis.yml index cfd595168f..9fd0720d85 100644 --- a/.travis.yml +++ b/.travis.yml @@ -12,7 +12,9 @@ matrix: include: - os: linux - python: 3.5 + DISTRIB="conda" PYTHON_VERSION="3.4" + - os: linux + DISTRIB="conda" PYTHON_VERSION="3.5" COVERAGE="true" # Set language to generic to not break travis-ci # https://github.com/travis-ci/travis-ci/issues/2312#issuecomment-195620855 @@ -21,9 +23,12 @@ matrix: - os: osx sudo: required language: generic + DISTRIB="conda" PYTHON_VERSION="3.4" + - os: osx + sudo: required + language: genericDISTRIB="conda" PYTHON_VERSION="3.5" COVERAGE="true" cache: - #apt: true # We use three different cache directory # to work around a Travis bug with multi-platform cache directories: @@ -31,27 +36,11 @@ cache: - $HOME/download pip: true -# command to install dependencies -#addons: -# apt: -# sources: -# - ubuntu-toolchain-r-test -# packages: -# - gcc-4.8 -# - g++-4.8 -# - libatlas-dev -# - liblapack-dev -# - libatlas-base-dev -# - gfortran - env: global: # Directory where tests are run from - TEST_DIR=/tmp/test_dir/ - MODULE=autosklearn - matrix: - - DISTRIB="conda" PYTHON_VERSION="3.4" - - DISTRIB="conda" 
PYTHON_VERSION="3.5" COVERAGE="true" install: # Necessary for random forest From d45cc14bf1b3f7450cfe4edfab1b3626bfade49a Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Mon, 24 Oct 2016 13:04:06 +0200 Subject: [PATCH 09/38] CI add env identifier to fix format error --- .travis.yml | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/.travis.yml b/.travis.yml index 9fd0720d85..ff082dbc0f 100644 --- a/.travis.yml +++ b/.travis.yml @@ -12,9 +12,9 @@ matrix: include: - os: linux - DISTRIB="conda" PYTHON_VERSION="3.4" + env: DISTRIB="conda" PYTHON_VERSION="3.4" - os: linux - DISTRIB="conda" PYTHON_VERSION="3.5" COVERAGE="true" + env: DISTRIB="conda" PYTHON_VERSION="3.5" COVERAGE="true" # Set language to generic to not break travis-ci # https://github.com/travis-ci/travis-ci/issues/2312#issuecomment-195620855 @@ -23,10 +23,11 @@ matrix: - os: osx sudo: required language: generic - DISTRIB="conda" PYTHON_VERSION="3.4" + env: DISTRIB="conda" PYTHON_VERSION="3.4" - os: osx sudo: required - language: genericDISTRIB="conda" PYTHON_VERSION="3.5" COVERAGE="true" + language: generic + env: DISTRIB="conda" PYTHON_VERSION="3.5" cache: # We use three different cache directory From 28a4d5e80f12a7188b837a0b7e18322079993ab3 Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Mon, 24 Oct 2016 13:11:57 +0200 Subject: [PATCH 10/38] CI remove OS from build matrix --- .travis.yml | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/.travis.yml b/.travis.yml index ff082dbc0f..9db11785ef 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,10 +2,6 @@ language: python sudo: false -os: - - linux - - osx - matrix: allow_failures: - os: osx @@ -37,6 +33,9 @@ cache: - $HOME/download pip: true +git: + depth: 5 + env: global: # Directory where tests are run from @@ -44,9 +43,8 @@ env: - MODULE=autosklearn install: - # Necessary for random forest - #- export CXX="g++-4.8" CC="gcc-4.8" - source ci_scripts/install.sh + script: bash ci_scripts/test.sh after_success: source ci_scripts/success.sh From 95a925c43b78290b42345838c2eeebbd72acc927 Mon Sep 17 00:00:00 2001 From: Anatolii Domashnev Date: Wed, 9 Nov 2016 16:30:25 +0100 Subject: [PATCH 11/38] ADD warning if dependencies are not met #170 --- autosklearn/__init__.py | 12 +++++ autosklearn/util/dependencies.py | 55 ++++++++++++++++++++++ test/test_util/test_dependencies.py | 72 +++++++++++++++++++++++++++++ 3 files changed, 139 insertions(+) create mode 100644 autosklearn/util/dependencies.py create mode 100644 test/test_util/test_dependencies.py diff --git a/autosklearn/__init__.py b/autosklearn/__init__.py index 2051e8e4cc..209195e1aa 100644 --- a/autosklearn/__init__.py +++ b/autosklearn/__init__.py @@ -1,2 +1,14 @@ # -*- encoding: utf-8 -*- +from autosklearn.util import dependencies + __version__ = '0.1.0' + +__MANDATORY_PACKAGES__ = ''' +scikit-learn==0.17.1 +smac==0.0.1 +lockfile>=0.10 +ConfigSpace>=0.2.1 +pyrfr==0.2.0 +''' + +dependencies.verify_packages(__MANDATORY_PACKAGES__) diff --git a/autosklearn/util/dependencies.py b/autosklearn/util/dependencies.py new file mode 100644 index 0000000000..9ec500902d --- /dev/null +++ b/autosklearn/util/dependencies.py @@ -0,0 +1,55 @@ +from warnings import warn + +import pkg_resources +import re + +from distutils.version import LooseVersion + + +RE_PATTERN = re.compile('^(?P[\w\-]+)((?P==|>=|>)(?P(\d+\.)?(\d+\.)?(\d+)))?$') + + +def verify_packages(packages): + if not packages: + return + if isinstance(packages, str): + packages = packages.splitlines() + + for package in 
packages: + if not package: + continue + + match = RE_PATTERN.match(package) + if match: + name = match.group('name') + operation = match.group('operation') + version = match.group('version') + _verify_package(name, operation, version) + else: + raise ValueError('Unable to read requirement: %s' % package) + + +def _verify_package(name, operation, version): + try: + module = pkg_resources.get_distribution(name) + except pkg_resources.DistributionNotFound: + warn('mandatory package \'%s\' not found' % name) + return + + if not operation: + return + + required_version = LooseVersion(version) + installed_version = LooseVersion(module.version) + + if operation == '==': + check = required_version == installed_version + elif operation == '>': + check = installed_version > required_version + elif operation == '>=': + check = installed_version > required_version or \ + installed_version == required_version + else: + raise NotImplementedError('operation %s is not supported' % operation) + if not check: + warn('\'%s\' version mismatch (%s%s)' % (name, operation, required_version)) diff --git a/test/test_util/test_dependencies.py b/test/test_util/test_dependencies.py new file mode 100644 index 0000000000..40040fd09d --- /dev/null +++ b/test/test_util/test_dependencies.py @@ -0,0 +1,72 @@ +import unittest +import warnings +import re + +from unittest.mock import patch, Mock + +import pkg_resources + +from autosklearn.util.dependencies import verify_packages, _verify_package + + +@patch('pkg_resources.get_distribution') +class VerifyPackagesTests(unittest.TestCase): + + def test_existing_package(self, getDistributionMock): + requirement = 'package' + + with warnings.catch_warnings(record=True) as w: + verify_packages(requirement) + self.assertEqual(0, len(w)) + + getDistributionMock.assert_called_once_with('package') + + def test_missing_package(self, getDistributionMock): + requirement = 'package' + + getDistributionMock.side_effect = pkg_resources.DistributionNotFound() + + self.assertWarnsRegex(UserWarning, "mandatory package 'package' not found", verify_packages, requirement) + + def test_correct_package_versions(self, getDistributionMock): + requirement = 'package==0.1.2\n' \ + 'package>0.1\n' \ + 'package>=0.1' + + moduleMock = Mock() + moduleMock.version = '0.1.2' + getDistributionMock.return_value = moduleMock + + with warnings.catch_warnings(record=True) as w: + verify_packages(requirement) + self.assertEqual(0, len(w)) + + getDistributionMock.assert_called_with('package') + self.assertEqual(3, len(getDistributionMock.call_args_list)) + + def test_wrong_package_version(self, getDistributionMock): + requirement = 'package>0.1.2' + + moduleMock = Mock() + moduleMock.version = '0.1.2' + getDistributionMock.return_value = moduleMock + + self.assertWarnsRegex(UserWarning, re.escape("'package' version mismatch (>0.1.2)"), verify_packages, requirement) + + def test_outdated_requirement(self, getDistributionMock): + requirement = 'package>=0.1' + + moduleMock = Mock() + moduleMock.version = '0.0.9' + getDistributionMock.return_value = moduleMock + + self.assertWarnsRegex(UserWarning, re.escape("'package' version mismatch (>=0.1)"), verify_packages, requirement) + + def test_too_fresh_requirement(self, getDistributionMock): + requirement = 'package==0.1.2' + + moduleMock = Mock() + moduleMock.version = '0.1.3' + getDistributionMock.return_value = moduleMock + + self.assertWarnsRegex(UserWarning, re.escape("'package' version mismatch (==0.1.2)"), verify_packages, requirement) \ No newline at end of file 
From ca75c5700f90ba0b051b26ca095487eb5a94be3b Mon Sep 17 00:00:00 2001 From: Anatolii Domashnev Date: Wed, 9 Nov 2016 17:54:41 +0100 Subject: [PATCH 12/38] Throw exception instead of warnings --- autosklearn/util/dependencies.py | 32 +++++++++++++++++++++++++---- test/test_util/test_dependencies.py | 23 +++++++++++---------- 2 files changed, 40 insertions(+), 15 deletions(-) diff --git a/autosklearn/util/dependencies.py b/autosklearn/util/dependencies.py index 9ec500902d..61ed76adf5 100644 --- a/autosklearn/util/dependencies.py +++ b/autosklearn/util/dependencies.py @@ -33,8 +33,7 @@ def _verify_package(name, operation, version): try: module = pkg_resources.get_distribution(name) except pkg_resources.DistributionNotFound: - warn('mandatory package \'%s\' not found' % name) - return + raise MissingPackageError(name) from None if not operation: return @@ -50,6 +49,31 @@ def _verify_package(name, operation, version): check = installed_version > required_version or \ installed_version == required_version else: - raise NotImplementedError('operation %s is not supported' % operation) + raise NotImplementedError('operation \'%s\' is not supported' % operation) if not check: - warn('\'%s\' version mismatch (%s%s)' % (name, operation, required_version)) + raise IncorrectPackageVersionError(name, installed_version, operation, required_version) + + +class MissingPackageError(Exception): + + error_message = 'mandatory package \'{name}\' not found' + + def __init__(self, package_name): + self.package_name = package_name + super(MissingPackageError, self).__init__(self.error_message.format(name=package_name)) + + +class IncorrectPackageVersionError(Exception): + + error_message = '\'{name} {installed_version}\' version mismatch ({operation}{required_version})' + + def __init__(self, package_name, installed_version, operation, required_version): + self.package_name = package_name + self.installed_version = installed_version + self.operation = operation + self.required_version = required_version + message = self.error_message.format(name=package_name, + installed_version=installed_version, + operation=operation, + required_version=required_version) + super(IncorrectPackageVersionError, self).__init__(message) diff --git a/test/test_util/test_dependencies.py b/test/test_util/test_dependencies.py index 40040fd09d..dc561c97c9 100644 --- a/test/test_util/test_dependencies.py +++ b/test/test_util/test_dependencies.py @@ -6,7 +6,8 @@ import pkg_resources -from autosklearn.util.dependencies import verify_packages, _verify_package +from autosklearn.util.dependencies import verify_packages, _verify_package, MissingPackageError, \ + IncorrectPackageVersionError @patch('pkg_resources.get_distribution') @@ -15,9 +16,7 @@ class VerifyPackagesTests(unittest.TestCase): def test_existing_package(self, getDistributionMock): requirement = 'package' - with warnings.catch_warnings(record=True) as w: - verify_packages(requirement) - self.assertEqual(0, len(w)) + verify_packages(requirement) getDistributionMock.assert_called_once_with('package') @@ -26,7 +25,8 @@ def test_missing_package(self, getDistributionMock): getDistributionMock.side_effect = pkg_resources.DistributionNotFound() - self.assertWarnsRegex(UserWarning, "mandatory package 'package' not found", verify_packages, requirement) + self.assertRaisesRegex(MissingPackageError, + "mandatory package 'package' not found", verify_packages, requirement) def test_correct_package_versions(self, getDistributionMock): requirement = 'package==0.1.2\n' \ @@ -37,9 +37,7 @@ def 
test_correct_package_versions(self, getDistributionMock): moduleMock.version = '0.1.2' getDistributionMock.return_value = moduleMock - with warnings.catch_warnings(record=True) as w: - verify_packages(requirement) - self.assertEqual(0, len(w)) + verify_packages(requirement) getDistributionMock.assert_called_with('package') self.assertEqual(3, len(getDistributionMock.call_args_list)) @@ -51,7 +49,8 @@ def test_wrong_package_version(self, getDistributionMock): moduleMock.version = '0.1.2' getDistributionMock.return_value = moduleMock - self.assertWarnsRegex(UserWarning, re.escape("'package' version mismatch (>0.1.2)"), verify_packages, requirement) + self.assertRaisesRegex(IncorrectPackageVersionError, + re.escape("'package 0.1.2' version mismatch (>0.1.2)"), verify_packages, requirement) def test_outdated_requirement(self, getDistributionMock): requirement = 'package>=0.1' @@ -60,7 +59,8 @@ def test_outdated_requirement(self, getDistributionMock): moduleMock.version = '0.0.9' getDistributionMock.return_value = moduleMock - self.assertWarnsRegex(UserWarning, re.escape("'package' version mismatch (>=0.1)"), verify_packages, requirement) + self.assertRaisesRegex(IncorrectPackageVersionError, + re.escape("'package 0.0.9' version mismatch (>=0.1)"), verify_packages, requirement) def test_too_fresh_requirement(self, getDistributionMock): requirement = 'package==0.1.2' @@ -69,4 +69,5 @@ def test_too_fresh_requirement(self, getDistributionMock): moduleMock.version = '0.1.3' getDistributionMock.return_value = moduleMock - self.assertWarnsRegex(UserWarning, re.escape("'package' version mismatch (==0.1.2)"), verify_packages, requirement) \ No newline at end of file + self.assertRaisesRegex(IncorrectPackageVersionError, + re.escape("'package 0.1.3' version mismatch (==0.1.2)"), verify_packages, requirement) \ No newline at end of file From 3b2eb47641e71a1362704cd419dab50087f38dac Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Tue, 15 Nov 2016 09:05:32 +0100 Subject: [PATCH 13/38] FIX issue #163 --- autosklearn/automl.py | 15 +++++++++++- test/test_automl/test_automl.py | 41 +++++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+), 1 deletion(-) create mode 100644 test/test_automl/test_automl.py diff --git a/autosklearn/automl.py b/autosklearn/automl.py index de0fa82429..aa127b294f 100644 --- a/autosklearn/automl.py +++ b/autosklearn/automl.py @@ -418,12 +418,25 @@ def refit(self, X, y): self.ensemble_ is None: self._load_models() + random_state = np.random.RandomState(self._seed) for identifier in self.models_: if identifier in self.ensemble_.get_model_identifiers(): model = self.models_[identifier] # this updates the model inplace, it can then later be used in # predict method - model.fit(X.copy(), y.copy()) + + # try to fit the model. If it fails, shuffle the data. This + # could alleviate the problem in algorithms that depend on + # the ordering of the data. 
+ for i in range(10): + try: + model.fit(X.copy(), y.copy()) + break + except ValueError: + indices = list(range(X.shape[0])) + random_state.shuffle(indices) + X = X[indices] + y = y[indices] self._can_predict = True return self diff --git a/test/test_automl/test_automl.py b/test/test_automl/test_automl.py new file mode 100644 index 0000000000..a405d67e5d --- /dev/null +++ b/test/test_automl/test_automl.py @@ -0,0 +1,41 @@ +# -*- encoding: utf-8 -*- +import multiprocessing +import os +import sys +import unittest +import unittest.mock + +import numpy as np + +from autosklearn.util.backend import Backend, BackendContext +from autosklearn.automl import AutoML + +sys.path.append(os.path.dirname(__file__)) +from base import Base + + +class AutoMLTest(Base, unittest.TestCase): + _multiprocess_can_split_ = True + + def test_refit_shuffle_on_fail(self): + output = os.path.join(self.test_dir, '..', '.tmp_refit_shuffle_on_fail') + context = BackendContext(output, output, False, False) + backend = Backend(context) + + failing_model = unittest.mock.Mock() + failing_model.fit.side_effect = [ValueError(), ValueError(), None] + + auto = AutoML(backend, 30, 30) + ensemble_mock = unittest.mock.Mock() + auto.ensemble_ = ensemble_mock + ensemble_mock.get_model_identifiers.return_value = [1] + + auto.models_ = {1: failing_model} + + X = np.array([1, 2, 3]) + y = np.array([1, 2, 3]) + auto.refit(X, y) + + self.assertEqual(failing_model.fit.call_count, 3) + + From c304f3c198501d262b644c5b6812c0ae7420643b Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Tue, 15 Nov 2016 09:07:38 +0100 Subject: [PATCH 14/38] MAINT use latest version of SMAC --- autosklearn/automl.py | 41 +- autosklearn/evaluation/__init__.py | 223 ++++--- autosklearn/smbo.py | 745 ++++++++++++++---------- requirements.txt | 2 +- setup.py | 2 +- test/test_automl/test_estimators.py | 10 +- test/test_automl/test_pickle.py | 2 +- test/test_automl/test_start_automl.py | 10 +- test/test_evaluation/test_evaluation.py | 50 +- 9 files changed, 631 insertions(+), 454 deletions(-) diff --git a/autosklearn/automl.py b/autosklearn/automl.py index aa127b294f..f4a28102df 100644 --- a/autosklearn/automl.py +++ b/autosklearn/automl.py @@ -5,6 +5,7 @@ import hashlib import io import os +import unittest.mock from ConfigSpace.io import pcs @@ -13,13 +14,15 @@ import scipy.stats from sklearn.base import BaseEstimator from smac.tae.execute_ta_run import StatusType +import smac.stats.stats +from smac.runhistory.runhistory import RunHistory from sklearn.grid_search import _CVScoreTuple from autosklearn.constants import * from autosklearn.data.data_manager_factory import get_data_manager from autosklearn.data.competition_data_manager import CompetitionDataManager from autosklearn.data.xy_data_manager import XYDataManager -from autosklearn.evaluation import resampling, eval_with_limits +from autosklearn.evaluation import resampling, ExecuteTaFuncWithQueue from autosklearn.evaluation import calculate_score from autosklearn.util import StopWatch, get_logger, setup_logger, \ pipeline @@ -234,35 +237,29 @@ def _do_dummy_prediction(self, datamanager, num_run): self._logger.info("Starting to create dummy predictions.") time_limit = int(self._time_for_task / 6.) 
memory_limit = int(self._ml_memory_limit) - - _info = eval_with_limits(datamanager, self._backend, 1, - self._seed, num_run, - self._resampling_strategy, - self._resampling_strategy_arguments, - memory_limit, time_limit, - logger=self._logger) - if _info[4] == StatusType.SUCCESS: + ta = ExecuteTaFuncWithQueue(backend=self._backend, + autosklearn_seed=self._seed, + resampling_strategy=self._resampling_strategy, + initial_num_run=num_run, + logger=self._logger) + + status, cost, runtime, additional_info = \ + ta.run(1, cutoff=time_limit, memory_limit=memory_limit) + if status == StatusType.SUCCESS: self._logger.info("Finished creating dummy prediction 1/2.") else: self._logger.error('Error creating dummy prediction 1/2:%s ', - _info[3]) - - num_run += 1 + additional_info) - _info = eval_with_limits(datamanager, self._backend, 2, - self._seed, num_run, - self._resampling_strategy, - self._resampling_strategy_arguments, - memory_limit, time_limit, - logger=self._logger) - if _info[4] == StatusType.SUCCESS: + status, cost, runtime, additional_info = \ + ta.run(2, cutoff=time_limit, memory_limit=memory_limit) + if status == StatusType.SUCCESS: self._logger.info("Finished creating dummy prediction 2/2.") else: self._logger.error('Error creating dummy prediction 2/2 %s', - _info[3]) + additional_info) - num_run += 1 - return num_run + return ta.num_run def _fit(self, datamanager): # Reset learnt stuff diff --git a/autosklearn/evaluation/__init__.py b/autosklearn/evaluation/__init__.py index 37ca061612..545f62fc93 100644 --- a/autosklearn/evaluation/__init__.py +++ b/autosklearn/evaluation/__init__.py @@ -1,6 +1,5 @@ # -*- encoding: utf-8 -*- -from __future__ import absolute_import - +import logging import multiprocessing import sys import time @@ -8,6 +7,7 @@ import pynisher from smac.tae.execute_ta_run import StatusType +from smac.tae.execute_func import AbstractTAFunc from .abstract_evaluator import * from .cv_evaluator import * @@ -19,86 +19,141 @@ WORST_POSSIBLE_RESULT = 2.0 -def _eval_wrapper(queue, config, data, backend, seed, num_run, subsample, - with_predictions, all_scoring_functions, output_y_test, - resampling_strategy, **resampling_strategy_args): - if resampling_strategy == 'holdout': - eval_function = eval_holdout - elif resampling_strategy == 'holdout-iterative-fit': - eval_function = eval_iterative_holdout - elif resampling_strategy == 'cv': - eval_function = eval_cv - elif resampling_strategy == 'partial-cv': - eval_function = eval_partial_cv - elif resampling_strategy == 'test': - eval_function = eval_t - output_y_test = False - with_predictions = False - else: - raise ValueError('Unknown resampling strategy %s' % - resampling_strategy) - - start_time = time.time() - try: - eval_function(queue=queue, config=config, data=data, - backend=backend, seed=seed, num_run=num_run, - subsample=subsample, with_predictions=with_predictions, - all_scoring_functions=all_scoring_functions, - output_y_test=output_y_test, - **resampling_strategy_args) - # We need to catch the 'limit'-exceptions of the pynisher here as well! 
- except pynisher.TimeoutException as e: - duration = time.time() - start_time - error_message = 'Timeout' - queue.put((duration, WORST_POSSIBLE_RESULT, seed, error_message, - StatusType.TIMEOUT)) - except MemoryError as e: - duration = time.time() - start_time - error_message = 'Memout' - queue.put((duration, WORST_POSSIBLE_RESULT, seed, error_message, - StatusType.MEMOUT)) - except Exception as e: - duration = time.time() - start_time - exc_info = sys.exc_info() - error_message = ''.join(traceback.format_exception(*exc_info)) - queue.put((duration, WORST_POSSIBLE_RESULT, seed, error_message, - StatusType.CRASHED)) - - -def eval_with_limits(datamanager, backend, config, seed, num_run, - resampling_strategy, - resampling_strategy_args, memory_limit, - func_eval_time_limit, subsample=None, - with_predictions=True, - all_scoring_functions=False, - output_y_test=True, - logger=None): - if resampling_strategy_args is None: - resampling_strategy_args = {} - - start_time = time.time() - queue = multiprocessing.Queue() - safe_eval = pynisher.enforce_limits(mem_in_mb=memory_limit, - wall_time_in_s=func_eval_time_limit, - grace_period_in_s=30, - logger=logger)(_eval_wrapper) - - try: - safe_eval(queue=queue, config=config, data=datamanager, - backend=backend, seed=seed, num_run=num_run, - subsample=subsample, - with_predictions=with_predictions, - all_scoring_functions=all_scoring_functions, - output_y_test=output_y_test, - resampling_strategy=resampling_strategy, - **resampling_strategy_args) - info = queue.get(block=True, timeout=2) - except Exception as e0: - error_message = 'Unknown error (%s) %s' % (type(e0), e0) - status = StatusType.CRASHED - - duration = time.time() - start_time - info = (duration, WORST_POSSIBLE_RESULT, seed, error_message, status) - - return info +class ExecuteTaFuncWithQueue(AbstractTAFunc): + + def __init__(self, backend, autosklearn_seed, resampling_strategy, + logger, initial_num_run=1, stats=None, runhistory=None, + run_obj='quality', par_factor=1, with_predictions=True, + all_scoring_functions=False, output_y_test=True, + **resampling_strategy_args): + + if resampling_strategy == 'holdout': + eval_function = eval_holdout + elif resampling_strategy == 'holdout-iterative-fit': + eval_function = eval_iterative_holdout + elif resampling_strategy == 'cv': + eval_function = eval_cv + elif resampling_strategy == 'partial-cv': + eval_function = eval_partial_cv + elif resampling_strategy == 'test': + eval_function = eval_t + output_y_test = False + with_predictions = False + else: + raise ValueError('Unknown resampling strategy %s' % + resampling_strategy) + + super().__init__(ta=eval_function, stats=stats, runhistory=runhistory, + run_obj=run_obj, par_factor=par_factor) + + self.backend = backend + self.autosklearn_seed = autosklearn_seed + self.resampling_strategy = resampling_strategy + self.num_run = initial_num_run + self.resampling_strategy_args = resampling_strategy_args + self.with_predictions = with_predictions + self.all_scoring_functions = all_scoring_functions + self.output_y_test = output_y_test + self.resampling_strategy_args = resampling_strategy_args + self.logger = logger + + def run(self, config, instance=None, + cutoff=None, + memory_limit=None, + seed=12345, + instance_specific="0"): + + D = self.backend.load_datamanager() + queue = multiprocessing.Queue() + + arguments = dict(logger=logging.getLogger("pynisher"), + wall_time_in_s=cutoff, + mem_in_mb=memory_limit) + obj_kwargs = dict(queue=queue, + config=config, + data=D, + backend=self.backend, + 
seed=self.autosklearn_seed, + num_run=self.num_run, + with_predictions=self.with_predictions, + all_scoring_functions=self.all_scoring_functions, + output_y_test=self.output_y_test, + subsample=None) + + obj = pynisher.enforce_limits(**arguments)(self.ta) + obj(**obj_kwargs) + + if obj.exit_status is pynisher.TimeoutException: + status = StatusType.TIMEOUT + cost = WORST_POSSIBLE_RESULT + additional_run_info = 'Timeout' + elif obj.exit_status is pynisher.MemorylimitException: + status = StatusType.MEMOUT + cost = WORST_POSSIBLE_RESULT + additional_run_info = 'Memout' + else: + try: + info = queue.get(block=True, timeout=2) + result = info[1] + error_message = info[3] + + if obj.exit_status == 0 and result is not None: + status = StatusType.SUCCESS + cost = result + additional_run_info = '' + else: + status = StatusType.CRASHED + cost = WORST_POSSIBLE_RESULT + additional_run_info = error_message + except Exception as e0: + additional_run_info = 'Unknown error (%s) %s' % (type(e0), e0) + status = StatusType.CRASHED + cost = WORST_POSSIBLE_RESULT + + runtime = float(obj.wall_clock_time) + self.num_run += 1 + return status, cost, runtime, additional_run_info + +# def eval_with_limits(config, datamanager, backend, seed, num_run, +# resampling_strategy, +# resampling_strategy_args, memory_limit, +# func_eval_time_limit, subsample=None, +# with_predictions=True, +# all_scoring_functions=False, +# output_y_test=True, +# logger=None, +# # arguments to please SMAC +# instance=None): +# if resampling_strategy_args is None: +# resampling_strategy_args = {} +# +# start_time = time.time() +# queue = multiprocessing.Queue() +# safe_eval = pynisher.enforce_limits(mem_in_mb=memory_limit, +# wall_time_in_s=func_eval_time_limit, +# grace_period_in_s=30, +# logger=logger)(_eval_wrapper) +# +# try: +# safe_eval(queue=queue, config=config, data=datamanager, +# backend=backend, seed=seed, num_run=num_run, +# subsample=subsample, +# with_predictions=with_predictions, +# all_scoring_functions=all_scoring_functions, +# output_y_test=output_y_test, +# resampling_strategy=resampling_strategy, +# **resampling_strategy_args) +# info = queue.get(block=True, timeout=2) +# +# except Exception as e0: +# error_message = 'Unknown error (%s) %s' % (type(e0), e0) +# status = StatusType.CRASHED +# +# duration = time.time() - start_time +# info = (duration, WORST_POSSIBLE_RESULT, seed, error_message, status) +# +# # TODO only return relevant information and make SMAC measure the rest! 
+# # Currently, everything has the status SUCESS +# #return info +# return info[1], info[3] diff --git a/autosklearn/smbo.py b/autosklearn/smbo.py index 48549725e8..19011ced9e 100644 --- a/autosklearn/smbo.py +++ b/autosklearn/smbo.py @@ -1,3 +1,4 @@ +import functools import os import time import traceback @@ -8,11 +9,12 @@ # JTS TODO: notify aaron to clean up these nasty nested modules from ConfigSpace.configuration_space import Configuration -from ConfigSpace.util import impute_inactive_values -from smac.smbo.smbo import SMBO, get_types +from smac.facade.smac_facade import SMAC +from smac.utils.util_funcs import get_types from smac.scenario.scenario import Scenario from smac.tae.execute_ta_run import StatusType +from smac.smbo.objective import average_cost from smac.runhistory.runhistory import RunHistory from smac.runhistory.runhistory2epm import RunHistory2EPM4Cost, \ RunHistory2EPM4EIPS @@ -27,7 +29,7 @@ from autosklearn.metalearning.mismbo import suggest_via_metalearning from autosklearn.data.abstract_data_manager import AbstractDataManager from autosklearn.data.competition_data_manager import CompetitionDataManager -from autosklearn.evaluation import eval_with_limits +from autosklearn.evaluation import ExecuteTaFuncWithQueue from autosklearn.util import get_logger from autosklearn.metalearning.metalearning.meta_base import MetaBase from autosklearn.metalearning.metafeatures.metafeatures import \ @@ -170,33 +172,6 @@ def _print_debug_info_of_init_configuration(initial_configurations, basename, basename, time_for_task - watcher.wall_elapsed(basename)) -class AutoMLScenario(Scenario): - """ - We specialize the smac3 scenario here as we would like - to create it in code, without actually reading a smac scenario file - """ - - def __init__(self, config_space, limit, cutoff_time, metafeatures, - output_dir, shared_model): - self.logger = get_logger(self.__class__.__name__) - - # Give SMAC at least 5 seconds - soft_limit = max(5, cutoff_time - 35) - - scenario_dict = {'cs': config_space, - 'run-obj': 'quality', - 'cutoff-time': soft_limit, - 'tuner-timeout': soft_limit, - 'wallclock-limit': limit, - 'features': metafeatures, - 'instances': [[name] for name in metafeatures], - 'output_dir': output_dir, - 'shared_model': shared_model} - - super(AutoMLScenario, self).__init__(scenario_dict) - # reset the logger, because otherwise we can't pickle the AutoMLScenario - self.logger = get_logger(self.__class__.__name__) - class AutoMLSMBO(object): def __init__(self, config_space, dataset_name, @@ -533,108 +508,110 @@ def run_smbo(self, max_iters=1000): # == first things first: load the datamanager self.reset_data_manager() - # == Initialize SMBO stuff + # == Initialize non-SMBO stuff # first create a scenario seed = self.seed # TODO num_params = len(self.config_space.get_hyperparameters()) # allocate a run history - run_history = RunHistory() - meta_runhistory = RunHistory() - meta_runs_dataset_indices = {} num_run = self.start_num_run instance_id = self.dataset_name + SENTINEL - # == Train on subset - # before doing anything, let us run the default_cfg - # on a subset of the available data to ensure that - # we at least have some models - # we will try three different ratios of decreasing magnitude - # in the hope that at least on the last one we will be able - # to get a model - n_data = self.datamanager.data['X_train'].shape[0] - subset_ratio = 10000. 
/ n_data - if subset_ratio >= 0.5: - subset_ratio = 0.33 - subset_ratios = [subset_ratio, subset_ratio * 0.10] - else: - subset_ratios = [subset_ratio, 500. / n_data] - self.logger.info("Training default configurations on a subset of " - "%d/%d data points." % - (int(n_data * subset_ratio), n_data)) - - # the time limit for these function evaluations is rigorously - # set to only 1/2 of a full function evaluation - subset_time_limit = max(5, int(self.func_eval_time_limit / 2)) - # the configs we want to run on the data subset are: - # 1) the default configs - # 2) a set of configs we selected for training on a subset - subset_configs = [self.config_space.get_default_configuration()] \ - + self.collect_additional_subset_defaults() - subset_config_succesful = [False] * len(subset_configs) - for subset_config_id, next_config in enumerate(subset_configs): - for i, ratio in enumerate(subset_ratios): - self.reset_data_manager() - n_data_subsample = int(n_data * ratio) - - # run the config, but throw away the result afterwards - # since this cfg was evaluated only on a subset - # and we don't want to confuse SMAC - self.logger.info("Starting to evaluate %d on SUBSET " - "with size %d and time limit %ds.", - num_run, n_data_subsample, - subset_time_limit) - self.logger.info(next_config) - _info = eval_with_limits( - datamanager=self.datamanager, backend=self.backend, - config=next_config, seed=seed, num_run=num_run, - resampling_strategy=self.resampling_strategy, - resampling_strategy_args=self.resampling_strategy_args, - memory_limit=self.memory_limit, - func_eval_time_limit=subset_time_limit, - subsample=n_data_subsample, - logger=self.logger) - (duration, result, _, additional_run_info, status) = _info - self.logger.info("Finished evaluating %d. configuration on SUBSET. " - "Duration %f; loss %f; status %s; additional run " - "info: %s ", num_run, duration, result, - str(status), additional_run_info) - - num_run += 1 - if i < len(subset_ratios) - 1: - if status != StatusType.SUCCESS: - # Do not increase num_run here, because we will try - # the same configuration with less data - self.logger.info("A CONFIG did not finish " - " for subset ratio %f -> going smaller", - ratio) - continue - else: - self.logger.info("Finished SUBSET training successfully" - " with ratio %f", ratio) - subset_config_succesful[subset_config_id] = True - break - else: - if status != StatusType.SUCCESS: - self.logger.info("A CONFIG did not finish " - " for subset ratio %f.", - ratio) - continue - else: - self.logger.info("Finished SUBSET training successfully" - " with ratio %f", ratio) - subset_config_succesful[subset_config_id] = True - break - - # Use the first non-failing configuration from the subsets as the new - # default configuration -> this guards us against the random forest - # failing on large, sparse datasets - default_cfg = None - for subset_config_id, next_config in enumerate(subset_configs): - if subset_config_succesful[subset_config_id]: - default_cfg = next_config - break - if default_cfg is None: - default_cfg = self.config_space.get_default_configuration() + # # == Train on subset + # # before doing anything, let us run the default_cfg + # # on a subset of the available data to ensure that + # # we at least have some models + # # we will try three different ratios of decreasing magnitude + # # in the hope that at least on the last one we will be able + # # to get a model + # n_data = self.datamanager.data['X_train'].shape[0] + # subset_ratio = 10000. 
/ n_data + # if subset_ratio >= 0.5: + # subset_ratio = 0.33 + # subset_ratios = [subset_ratio, subset_ratio * 0.10] + # else: + # subset_ratios = [subset_ratio, 500. / n_data] + # self.logger.info("Training default configurations on a subset of " + # "%d/%d data points." % + # (int(n_data * subset_ratio), n_data)) + # + # # the time limit for these function evaluations is rigorously + # # set to only 1/2 of a full function evaluation + # subset_time_limit = max(5, int(self.func_eval_time_limit / 2)) + # # the configs we want to run on the data subset are: + # # 1) the default configs + # # 2) a set of configs we selected for training on a subset + # subset_configs = [self.config_space.get_default_configuration()] \ + # + self.collect_additional_subset_defaults() + # subset_config_succesful = [False] * len(subset_configs) + # for subset_config_id, next_config in enumerate(subset_configs): + # for i, ratio in enumerate(subset_ratios): + # self.reset_data_manager() + # n_data_subsample = int(n_data * ratio) + # + # # run the config, but throw away the result afterwards + # # since this cfg was evaluated only on a subset + # # and we don't want to confuse SMAC + # self.logger.info("Starting to evaluate %d on SUBSET " + # "with size %d and time limit %ds.", + # num_run, n_data_subsample, + # subset_time_limit) + # self.logger.info(next_config) + # _info = eval_with_limits( + # datamanager=self.datamanager, backend=self.backend, + # config=next_config, seed=seed, num_run=num_run, + # resampling_strategy=self.resampling_strategy, + # resampling_strategy_args=self.resampling_strategy_args, + # memory_limit=self.memory_limit, + # func_eval_time_limit=subset_time_limit, + # subsample=n_data_subsample, + # logger=self.logger) + # (duration, result, _, additional_run_info, status) = _info + # self.logger.info("Finished evaluating %d. configuration on SUBSET. 
" + # "Duration %f; loss %f; status %s; additional run " + # "info: %s ", num_run, duration, result, + # str(status), additional_run_info) + # + # num_run += 1 + # if i < len(subset_ratios) - 1: + # if status != StatusType.SUCCESS: + # # Do not increase num_run here, because we will try + # # the same configuration with less data + # self.logger.info("A CONFIG did not finish " + # " for subset ratio %f -> going smaller", + # ratio) + # continue + # else: + # self.logger.info("Finished SUBSET training successfully" + # " with ratio %f", ratio) + # subset_config_succesful[subset_config_id] = True + # break + # else: + # if status != StatusType.SUCCESS: + # self.logger.info("A CONFIG did not finish " + # " for subset ratio %f.", + # ratio) + # continue + # else: + # self.logger.info("Finished SUBSET training successfully" + # " with ratio %f", ratio) + # subset_config_succesful[subset_config_id] = True + # break + # + # # Use the first non-failing configuration from the subsets as the new + # # default configuration -> this guards us against the random forest + # # failing on large, sparse datasets + # default_cfg = None + # for subset_config_id, next_config in enumerate(subset_configs): + # if subset_config_succesful[subset_config_id]: + # default_cfg = next_config + # break + # if default_cfg is None: + # default_cfg = self.config_space.get_default_configuration() + + # Initialize some SMAC dependencies + run_history = RunHistory(aggregate_func=average_cost) + meta_runhistory = RunHistory(aggregate_func=average_cost) + meta_runs_dataset_indices = {} # == METALEARNING suggestions # we start by evaluating the defaults on the full dataset again @@ -734,30 +711,30 @@ def run_smbo(self, max_iters=1000): self.logger.critical('Reverting to acquisition function EI!') self.acquisition_function = 'EI' - for meta_dataset in meta_runs.index: - meta_dataset_start_index = meta_runs_index - for meta_configuration in meta_runs.columns: - if np.isfinite(meta_runs.loc[meta_dataset, meta_configuration]): - try: - config = meta_base.get_configuration_from_algorithm_index( - meta_configuration) - cost = meta_runs.loc[meta_dataset, meta_configuration] - if read_runtime_data: - runtime = meta_durations.loc[meta_dataset, - meta_configuration] - else: - runtime = 1 - # TODO read out other status types! - meta_runhistory.add(config, cost, runtime, - StatusType.SUCCESS, - instance_id=meta_dataset) - meta_runs_index += 1 - except: - # TODO maybe add warning - pass - - meta_runs_dataset_indices[meta_dataset] = ( - meta_dataset_start_index, meta_runs_index) + # for meta_dataset in meta_runs.index: + # meta_dataset_start_index = meta_runs_index + # for meta_configuration in meta_runs.columns: + # if np.isfinite(meta_runs.loc[meta_dataset, meta_configuration]): + # try: + # config = meta_base.get_configuration_from_algorithm_index( + # meta_configuration) + # cost = meta_runs.loc[meta_dataset, meta_configuration] + # if read_runtime_data: + # runtime = meta_durations.loc[meta_dataset, + # meta_configuration] + # else: + # runtime = 1 + # # TODO read out other status types! 
+ # meta_runhistory.add(config, cost, runtime, + # StatusType.SUCCESS, + # instance_id=meta_dataset) + # meta_runs_index += 1 + # except: + # # TODO maybe add warning + # pass + # + # meta_runs_dataset_indices[meta_dataset] = ( + # meta_dataset_start_index, meta_runs_index) else: if self.acquisition_function == 'EIPS': self.logger.critical('Reverting to acquisition function EI!') @@ -766,12 +743,23 @@ def run_smbo(self, max_iters=1000): meta_features_dict = {} metalearning_configurations = [] - self.scenario = AutoMLScenario(config_space=self.config_space, - limit=self.total_walltime_limit, - cutoff_time=self.func_eval_time_limit, - metafeatures=meta_features_dict, - output_dir=self.backend.temporary_directory, - shared_model=self.shared_mode) + self.scenario = Scenario({'cs': self.config_space, + 'wallclock-limit': self.total_walltime_limit, + 'instances': [[name] for name in meta_features_dict], + 'output-dir': self.backend.temporary_directory, + 'shared-model': self.shared_mode, + 'run-obj': 'quality'}) + + # TODO rebuild target algorithm to be it's own target algorithm + # evaluator, which takes into account that a run can be killed prior + # to the model being fully fitted; thus putting intermediate results + # into a queue and querying them once the time is over + ta = ExecuteTaFuncWithQueue(backend=self.backend, + autosklearn_seed=seed, + resampling_strategy=self.resampling_strategy, + initial_num_run=num_run, + logger=self.logger, + **self.resampling_strategy_args) types = get_types(self.config_space, self.scenario.feature_array) if self.acquisition_function == 'EI': @@ -783,8 +771,11 @@ def run_smbo(self, max_iters=1000): model = RandomForestWithInstances(types, instance_features=meta_features_list, seed=1, num_trees=10) - smac = SMBO(self.scenario, model=model, - rng=seed) + smac = SMAC(scenario=self.scenario, + model=model, + rng=seed, + tae_runner=ta, + runhistory=run_history) elif self.acquisition_function == 'EIPS': rh2EPM = RunHistory2EPM4EIPS(num_params=num_params, scenario=self.scenario, @@ -795,13 +786,17 @@ def run_smbo(self, max_iters=1000): ['cost', 'runtime'], types, num_trees = 10, instance_features=meta_features_list, seed=1) acquisition_function = EIPS(model) - smac = SMBO(self.scenario, + smac = SMAC(scenario=self.scenario, tae_runner=ta, acquisition_function=acquisition_function, - model=model, runhistory2epm=rh2EPM, rng=seed) + model=model, runhistory2epm=rh2EPM, rng=seed, + runhistory=run_history) else: raise ValueError('Unknown acquisition function value %s!' % self.acquisition_function) + smac.solver.stats.start_timing() + smac.solver.incumbent = smac.solver.initial_design.run() + # Build a runtime model # runtime_rf = RandomForestWithInstances(types, # instance_features=meta_features_list, @@ -813,85 +808,118 @@ def run_smbo(self, max_iters=1000): # impute_state=None) # X_runtime, y_runtime = runtime_rh2EPM.transform(meta_runhistory) # runtime_rf.train(X_runtime, y_runtime[:, 1].flatten()) - X_meta, Y_meta = rh2EPM.transform(meta_runhistory) - # Transform Y_meta on a per-dataset base - for meta_dataset in meta_runs_dataset_indices: - start_index, end_index = meta_runs_dataset_indices[meta_dataset] - end_index += 1 # Python indexing - Y_meta[start_index:end_index, 0]\ - [Y_meta[start_index:end_index, 0] >2.0] = 2.0 - dataset_minimum = np.min(Y_meta[start_index:end_index, 0]) - Y_meta[start_index:end_index, 0] = 1 - ( - (1. - Y_meta[start_index:end_index, 0]) / - (1. 
- dataset_minimum)) - Y_meta[start_index:end_index, 0]\ - [Y_meta[start_index:end_index, 0] > 2] = 2 - + # X_meta, Y_meta = rh2EPM.transform(meta_runhistory) + # # Transform Y_meta on a per-dataset base + # for meta_dataset in meta_runs_dataset_indices: + # start_index, end_index = meta_runs_dataset_indices[meta_dataset] + # end_index += 1 # Python indexing + # Y_meta[start_index:end_index, 0]\ + # [Y_meta[start_index:end_index, 0] >2.0] = 2.0 + # dataset_minimum = np.min(Y_meta[start_index:end_index, 0]) + # Y_meta[start_index:end_index, 0] = 1 - ( + # (1. - Y_meta[start_index:end_index, 0]) / + # (1. - dataset_minimum)) + # Y_meta[start_index:end_index, 0]\ + # [Y_meta[start_index:end_index, 0] > 2] = 2 + + smac.solver.stats.start_timing() # == first, evaluate all metelearning and default configurations - finished = False - for i, next_config in enumerate(([default_cfg] + - metalearning_configurations)): - # Do not evaluate default configurations more than once - if i >= len([default_cfg]) and next_config in [default_cfg]: - continue - - config_name = 'meta-learning' if i >= len([default_cfg]) \ - else 'default' - - self.logger.info("Starting to evaluate %d. configuration " - "(%s configuration) with time limit %ds.", - num_run, config_name, self.func_eval_time_limit) - self.logger.info(next_config) - self.reset_data_manager() - info = eval_with_limits(datamanager=self.datamanager, - backend=self.backend, - config=next_config, - seed=seed, num_run=num_run, - resampling_strategy=self.resampling_strategy, - resampling_strategy_args=self.resampling_strategy_args, - memory_limit=self.memory_limit, - func_eval_time_limit=self.func_eval_time_limit, - logger=self.logger) - (duration, result, _, additional_run_info, status) = info - run_history.add(config=next_config, cost=result, - time=duration, status=status, - instance_id=instance_id, seed=seed, - additional_info=additional_run_info) - run_history.update_cost(next_config, result) - self.logger.info("Finished evaluating %d. configuration. " - "Duration %f; loss %f; status %s; additional run " - "info: %s ", num_run, duration, result, - str(status), additional_run_info) - num_run += 1 - if smac.incumbent is None: - smac.incumbent = next_config - elif result < run_history.get_cost(smac.incumbent): - smac.incumbent = next_config - - if self.scenario.shared_model: + smac.solver.incumbent = smac.solver.initial_design.run() + runkey = list(run_history.data.keys())[-1] + runvalue = run_history.data[runkey] + # self.logger.info("Finished evaluating %d. configuration. " + # "Duration %f; loss %f; status %s; additional run " + # "info: %s ", num_run, runvalue.time, runvalue.cost, + # str(runvalue.status), runvalue.additional_info) + + for challenger in metalearning_configurations: + + smac.solver.incumbent, inc_perf = smac.solver.intensifier.intensify( + challengers=[challenger], + incumbent=smac.solver.incumbent, + run_history=smac.solver.runhistory, + aggregate_func=smac.solver.aggregate_func, + time_bound=self.total_walltime_limit) + + if smac.solver.scenario.shared_model: pSMAC.write(run_history=run_history, - output_directory=self.scenario.output_dir, + output_directory=smac.solver.scenario.output_dir, num_run=self.seed) - if self.watcher.wall_elapsed( - 'SMBO') > self.total_walltime_limit: - finished = True + runkey = list(run_history.data.keys())[-1] + runvalue = run_history.data[runkey] + # self.logger.info("Finished evaluating %d. configuration. 
" + # "Duration %f; loss %f; status %s; additional run " + # "info: %s ", num_run, runvalue.time, runvalue.cost, + # str(runvalue.status), runvalue.additional_info) - if finished: + if smac.solver.stats.is_budget_exhausted(): break + # TODO print SMAC stats + + # finished = False + # for i, next_config in enumerate(([default_cfg] + + # metalearning_configurations)): + # # Do not evaluate default configurations more than once + # if i >= len([default_cfg]) and next_config in [default_cfg]: + # continue + # + # config_name = 'meta-learning' if i >= len([default_cfg]) \ + # else 'default' + # + # self.logger.info("Starting to evaluate %d. configuration " + # "(%s configuration) with time limit %ds.", + # num_run, config_name, self.func_eval_time_limit) + # self.logger.info(next_config) + # self.reset_data_manager() + # info = eval_with_limits(datamanager=self.datamanager, + # backend=self.backend, + # config=next_config, + # seed=seed, num_run=num_run, + # resampling_strategy=self.resampling_strategy, + # resampling_strategy_args=self.resampling_strategy_args, + # memory_limit=self.memory_limit, + # func_eval_time_limit=self.func_eval_time_limit, + # logger=self.logger) + # (duration, result, _, additional_run_info, status) = info + # run_history.add(config=next_config, cost=result, + # time=duration, status=status, + # instance_id=instance_id, seed=seed, + # additional_info=additional_run_info) + # run_history.update_cost(next_config, result) + # self.logger.info("Finished evaluating %d. configuration. " + # "Duration %f; loss %f; status %s; additional run " + # "info: %s ", num_run, duration, result, + # str(status), additional_run_info) + # num_run += 1 + # if smac.incumbent is None: + # smac.incumbent = next_config + # elif result < run_history.get_cost(smac.incumbent): + # smac.incumbent = next_config + # + # if self.scenario.shared_model: + # pSMAC.write(run_history=run_history, + # output_directory=self.scenario.output_dir, + # num_run=self.seed) + # + # if self.watcher.wall_elapsed( + # 'SMBO') > self.total_walltime_limit: + # finished = True + # + # if finished: + # break + # == after metalearning run SMAC loop - smac.runhistory = run_history - smac_iter = 0 - while not finished: - if self.scenario.shared_model: + while True: + if smac.solver.scenario.shared_model: pSMAC.read(run_history=run_history, output_directory=self.scenario.output_dir, configuration_space=self.config_space, logger=self.logger) - next_configs = [] - time_for_choose_next = -1 + challengers = [] + choose_next_start_time = time.time() try: X_cfg, Y_cfg = rh2EPM.transform(run_history) @@ -902,113 +930,194 @@ def run_smbo(self, max_iters=1000): (1. - dataset_minimum)) Y_cfg[:, 0][Y_cfg[:, 0] > 2] = 2 - if len(X_meta) > 0 and len(X_cfg) > 0: - pass - #X_cfg = np.concatenate((X_meta, X_cfg)) - #Y_cfg = np.concatenate((Y_meta, Y_cfg)) - elif len(X_meta) > 0: - X_cfg = X_meta.copy() - Y_cfg = Y_meta.copy() - elif len(X_cfg) > 0: - X_cfg = X_cfg.copy() - Y_cfg = Y_cfg.copy() - else: - raise ValueError('No training data for SMAC random forest!') + #if len(X_meta) > 0 and len(X_cfg) > 0: + # pass + # X_cfg = np.concatenate((X_meta, X_cfg)) + # Y_cfg = np.concatenate((Y_meta, Y_cfg)) + #elif len(X_meta) > 0: + # X_cfg = X_meta.copy() + # Y_cfg = Y_meta.copy() + #elif len(X_cfg) > 0: + X_cfg = X_cfg.copy() + Y_cfg = Y_cfg.copy() + #else: + # raise ValueError('No training data for SMAC random forest!') self.logger.info('Using %d training points for SMAC.' 
% X_cfg.shape[0]) choose_next_start_time = time.time() - next_configs_tmp = smac.choose_next(X_cfg, Y_cfg, - num_interleaved_random=110, - num_configurations_by_local_search=10, - num_configurations_by_random_search_sorted=100) + next_configs_tmp = smac.solver.choose_next( + X_cfg, Y_cfg, num_interleaved_random=110, + num_configurations_by_local_search=10, + num_configurations_by_random_search_sorted=100) time_for_choose_next = time.time() - choose_next_start_time self.logger.info('Used %g seconds to find next ' 'configurations' % (time_for_choose_next)) - next_configs.extend(next_configs_tmp) - # TODO put Exception here! + challengers.extend(next_configs_tmp) except Exception as e: self.logger.error(e) self.logger.error("Error in getting next configurations " "with SMAC. Using random configuration!") next_config = self.config_space.sample_configuration() - next_configs.append(next_config) - - models_fitted_this_iteration = 0 - start_time_this_iteration = time.time() - for next_config in next_configs: - x_runtime = impute_inactive_values(next_config) - x_runtime = impute_inactive_values(x_runtime).get_array() - # predicted_runtime = runtime_rf.predict_marginalized_over_instances( - # x_runtime.reshape((1, -1))) - # predicted_runtime = np.exp(predicted_runtime[0][0][0]) - 1 - - self.logger.info("Starting to evaluate %d. configuration (from " - "SMAC) with time limit %ds.", num_run, - self.func_eval_time_limit) - self.logger.info(next_config) - self.reset_data_manager() - info = eval_with_limits(datamanager=self.datamanager, - backend=self.backend, - config=next_config, - seed=seed, num_run=num_run, - resampling_strategy=self.resampling_strategy, - resampling_strategy_args=self.resampling_strategy_args, - memory_limit=self.memory_limit, - func_eval_time_limit=self.func_eval_time_limit, - logger=self.logger) - (duration, result, _, additional_run_info, status) = info - run_history.add(config=next_config, cost=result, - time=duration, status=status, - instance_id=instance_id, seed=seed, - additional_info=additional_run_info) - run_history.update_cost(next_config, result) - - #self.logger.info('Predicted runtime %g, true runtime %g', - # predicted_runtime, duration) - - # TODO add unittest to make sure everything works fine and - # this does not get outdated! - if smac.incumbent is None: - smac.incumbent = next_config - elif result < run_history.get_cost(smac.incumbent): - smac.incumbent = next_config - - self.logger.info("Finished evaluating %d. configuration. " - "Duration: %f; loss: %f; status %s; additional " - "run info: %s ", num_run, duration, result, - str(status), additional_run_info) - smac_iter += 1 - num_run += 1 - - models_fitted_this_iteration += 1 - time_used_this_iteration = time.time() - start_time_this_iteration - - if max_iters is not None: - finished = (smac_iter >= max_iters) - - if self.watcher.wall_elapsed( - 'SMBO') > self.total_walltime_limit: - finished = True - - if models_fitted_this_iteration >= 2 and \ - time_for_choose_next > 0 and \ - time_used_this_iteration > time_for_choose_next: - break - elif time_for_choose_next <= 0 and \ - models_fitted_this_iteration >= 1: - break - elif models_fitted_this_iteration >= 50: - break - - if finished: - break - - if self.scenario.shared_model: + challengers.append(next_config) + time_for_choose_next = time.time() - choose_next_start_time + + # self.logger.info("Finished evaluating %d. configuration. 
" + # "Duration: %f; loss: %f; status %s; additional " + # "run info: %s ", num_run, duration, result, + # str(status), additional_run_info) + + smac.solver.incumbent, inc_perf = smac.solver.intensifier.intensify( + challengers=challengers, + incumbent=smac.solver.incumbent, + run_history=smac.solver.runhistory, + aggregate_func=smac.solver.aggregate_func, + time_bound=time_for_choose_next) + + if smac.solver.scenario.shared_model: pSMAC.write(run_history=run_history, - output_directory=self.scenario.output_dir, + output_directory=smac.solver.scenario.output_dir, num_run=self.seed) + # runkey = list(run_history.data.keys())[-1] + # runvalue = run_history.data[runkey] + # self.logger.info("Finished evaluating %d. configuration. " + # "Duration %f; loss %f; status %s; additional run " + # "info: %s ", num_run, runvalue.time, runvalue.cost, + # str(runvalue.status), runvalue.additional_info) + + if smac.solver.stats.is_budget_exhausted(): + break + self.runhistory = run_history - + + # smac.runhistory = run_history + # smac_iter = 0 + # while not finished: + # if self.scenario.shared_model: + # pSMAC.read(run_history=run_history, + # output_directory=self.scenario.output_dir, + # configuration_space=self.config_space, + # logger=self.logger) + # + # next_configs = [] + # time_for_choose_next = -1 + # try: + # X_cfg, Y_cfg = rh2EPM.transform(run_history) + # + # if not run_history.empty(): + # # Update costs by normalization + # dataset_minimum = np.min(Y_cfg[:, 0]) + # Y_cfg[:, 0] = 1 - ((1. - Y_cfg[:, 0]) / + # (1. - dataset_minimum)) + # Y_cfg[:, 0][Y_cfg[:, 0] > 2] = 2 + # + # if len(X_meta) > 0 and len(X_cfg) > 0: + # pass + # #X_cfg = np.concatenate((X_meta, X_cfg)) + # #Y_cfg = np.concatenate((Y_meta, Y_cfg)) + # elif len(X_meta) > 0: + # X_cfg = X_meta.copy() + # Y_cfg = Y_meta.copy() + # elif len(X_cfg) > 0: + # X_cfg = X_cfg.copy() + # Y_cfg = Y_cfg.copy() + # else: + # raise ValueError('No training data for SMAC random forest!') + # + # self.logger.info('Using %d training points for SMAC.' % + # X_cfg.shape[0]) + # choose_next_start_time = time.time() + # next_configs_tmp = smac.solver.choose_next( + # X_cfg, Y_cfg, num_interleaved_random=110, + # num_configurations_by_local_search=10, + # num_configurations_by_random_search_sorted=100) + # time_for_choose_next = time.time() - choose_next_start_time + # self.logger.info('Used %g seconds to find next ' + # 'configurations' % (time_for_choose_next)) + # next_configs.extend(next_configs_tmp) + # except Exception as e: + # self.logger.error(e) + # self.logger.error("Error in getting next configurations " + # "with SMAC. Using random configuration!") + # next_config = self.config_space.sample_configuration() + # next_configs.append(next_config) + # + # models_fitted_this_iteration = 0 + # start_time_this_iteration = time.time() + # for next_config in next_configs: + # #x_runtime = impute_inactive_values(next_config) + # #x_runtime = impute_inactive_values(x_runtime).get_array() + # # predicted_runtime = runtime_rf.predict_marginalized_over_instances( + # # x_runtime.reshape((1, -1))) + # # predicted_runtime = np.exp(predicted_runtime[0][0][0]) - 1 + # + # self.logger.info("Starting to evaluate %d. 
configuration (from " + # "SMAC) with time limit %ds.", num_run, + # self.func_eval_time_limit) + # self.logger.info(next_config) + # self.reset_data_manager() + # info = eval_with_limits(datamanager=self.datamanager, + # backend=self.backend, + # config=next_config, + # seed=seed, num_run=num_run, + # resampling_strategy=self.resampling_strategy, + # resampling_strategy_args=self.resampling_strategy_args, + # memory_limit=self.memory_limit, + # func_eval_time_limit=self.func_eval_time_limit, + # logger=self.logger) + # (duration, result, _, additional_run_info, status) = info + # run_history.add(config=next_config, cost=result, + # time=duration, status=status, + # instance_id=instance_id, seed=seed, + # additional_info=additional_run_info) + # run_history.update_cost(next_config, result) + # + # #self.logger.info('Predicted runtime %g, true runtime %g', + # # predicted_runtime, duration) + # + # # TODO add unittest to make sure everything works fine and + # # this does not get outdated! + # if smac.incumbent is None: + # smac.incumbent = next_config + # elif result < run_history.get_cost(smac.incumbent): + # smac.incumbent = next_config + # + # self.logger.info("Finished evaluating %d. configuration. " + # "Duration: %f; loss: %f; status %s; additional " + # "run info: %s ", num_run, duration, result, + # str(status), additional_run_info) + # smac_iter += 1 + # num_run += 1 + # + # models_fitted_this_iteration += 1 + # time_used_this_iteration = time.time() - start_time_this_iteration + # + # if max_iters is not None: + # finished = (smac_iter >= max_iters) + # + # if self.watcher.wall_elapsed( + # 'SMBO') > self.total_walltime_limit: + # finished = True + # + # if models_fitted_this_iteration >= 2 and \ + # time_for_choose_next > 0 and \ + # time_used_this_iteration > time_for_choose_next: + # break + # elif time_for_choose_next <= 0 and \ + # models_fitted_this_iteration >= 1: + # break + # elif models_fitted_this_iteration >= 50: + # break + # + # if finished: + # break + # + # if self.scenario.shared_model: + # pSMAC.write(run_history=run_history, + # output_directory=self.scenario.output_dir, + # num_run=self.seed) + # + # self.runhistory = run_history + # diff --git a/requirements.txt b/requirements.txt index eeb4c7fa55..dee4d27145 100644 --- a/requirements.txt +++ b/requirements.txt @@ -23,4 +23,4 @@ xgboost==0.4a30 ConfigSpace pynisher>=0.4 pyrfr -smac==0.0.1 +smac==0.2.0 diff --git a/setup.py b/setup.py index 7e8aaa6171..1618fae12b 100644 --- a/setup.py +++ b/setup.py @@ -34,7 +34,7 @@ "ConfigSpace", "pynisher>=0.4", "pyrfr", - "smac==0.0.1" + "smac==0.2.0" ] diff --git a/test/test_automl/test_estimators.py b/test/test_automl/test_estimators.py index 81e73658fa..2815347ae0 100644 --- a/test/test_automl/test_estimators.py +++ b/test/test_automl/test_estimators.py @@ -43,7 +43,7 @@ def test_fit(self): X_train, Y_train, X_test, Y_test = putil.get_dataset('iris') automl = AutoSklearnClassifier(time_left_for_this_task=15, - per_run_time_limit=15, + per_run_time_limit=5, tmp_folder=output, output_folder=output) automl.fit(X_train, Y_train) @@ -103,7 +103,7 @@ def test_fit_pSMAC(self): Y_test = Y_test + 1 automl = AutoSklearnClassifier(time_left_for_this_task=15, - per_run_time_limit=15, + per_run_time_limit=5, output_folder=output, tmp_folder=output, shared_mode=True, @@ -138,7 +138,7 @@ def test_fit_pSMAC(self): backend.save_model(dummy, 30, 1) automl = AutoSklearnClassifier(time_left_for_this_task=15, - per_run_time_limit=15, + per_run_time_limit=5, output_folder=output, 
tmp_folder=output, shared_mode=True, @@ -173,7 +173,7 @@ def test_grid_scores(self): self._setUp(output) cls = AutoSklearnClassifier(time_left_for_this_task=15, - per_run_time_limit=15, + per_run_time_limit=5, output_folder=output, tmp_folder=output, shared_mode=False, @@ -212,7 +212,7 @@ def test_cv_results(self): X_train, Y_train, X_test, Y_test = putil.get_dataset('iris') cls = AutoSklearnClassifier(time_left_for_this_task=15, - per_run_time_limit=15, + per_run_time_limit=5, output_folder=output, tmp_folder=output, shared_mode=False, diff --git a/test/test_automl/test_pickle.py b/test/test_automl/test_pickle.py index 0206a0d8cf..6db44f5a15 100644 --- a/test/test_automl/test_pickle.py +++ b/test/test_automl/test_pickle.py @@ -19,7 +19,7 @@ def test_can_pickle_classifier(self): X_train, Y_train, X_test, Y_test = putil.get_dataset('iris') automl = AutoSklearnClassifier(time_left_for_this_task=15, - per_run_time_limit=15, + per_run_time_limit=5, tmp_folder=output, output_folder=output) automl.fit(X_train, Y_train) diff --git a/test/test_automl/test_start_automl.py b/test/test_automl/test_start_automl.py index 0221e39b67..0ad5ba30df 100644 --- a/test/test_automl/test_start_automl.py +++ b/test/test_automl/test_start_automl.py @@ -1,6 +1,4 @@ # -*- encoding: utf-8 -*- -from __future__ import print_function - import multiprocessing import os import sys @@ -30,7 +28,7 @@ def test_fit(self): X_train, Y_train, X_test, Y_test = putil.get_dataset('iris') backend_api = backend.create(output, output) - automl = autosklearn.automl.AutoML(backend_api, 15, 15) + automl = autosklearn.automl.AutoML(backend_api, 15, 5) automl.fit(X_train, Y_train) score = automl.score(X_test, Y_test) self.assertGreaterEqual(score, 0.8) @@ -57,7 +55,7 @@ def test_binary_score(self): Y_test = data[1][700:] backend_api = backend.create(output, output) - automl = autosklearn.automl.AutoML(backend_api, 15, 15) + automl = autosklearn.automl.AutoML(backend_api, 15, 5) automl.fit(X_train, Y_train, task=BINARY_CLASSIFICATION) self.assertEqual(automl._task, BINARY_CLASSIFICATION) @@ -78,7 +76,7 @@ def test_automl_outputs(self): backend_api = backend.create(output, output) auto = autosklearn.automl.AutoML( - backend_api, 15, 15, + backend_api, 15, 5, initial_configurations_via_metalearning=25, seed=100) auto.fit_automl_dataset(dataset) @@ -131,7 +129,7 @@ def test_do_dummy_prediction(self): backend_api = backend.create(output, output) auto = autosklearn.automl.AutoML( - backend_api, 15, 15, + backend_api, 15, 5, initial_configurations_via_metalearning=25) setup_logger() auto._logger = get_logger('test_do_dummy_predictions') diff --git a/test/test_evaluation/test_evaluation.py b/test/test_evaluation/test_evaluation.py index 0bc7ab03c3..abf3c8825c 100644 --- a/test/test_evaluation/test_evaluation.py +++ b/test/test_evaluation/test_evaluation.py @@ -1,4 +1,5 @@ import os +import logging import shutil import sys import time @@ -18,18 +19,24 @@ from smac.tae.execute_ta_run import StatusType from evaluation_util import get_multiclass_classification_datamanager -from autosklearn.evaluation import eval_with_limits +from autosklearn.evaluation import ExecuteTaFuncWithQueue def safe_eval_success_mock(*args, **kwargs): queue = kwargs['queue'] - queue.put((0.1, 1.0, 1, '', StatusType.SUCCESS)) + queue.put((StatusType.SUCCESS, 0.5, 0.12345, '')) + + +class BackendMock(object): + def load_datamanager(self): + return get_multiclass_classification_datamanager() class EvaluationTest(unittest.TestCase): def setUp(self): self.datamanager = 
get_multiclass_classification_datamanager() self.tmp = os.path.join(os.getcwd(), '.test_evaluation') + self.logger = logging.getLogger() try: shutil.rmtree(self.tmp) @@ -63,32 +70,43 @@ def run_over_time(): @mock.patch('autosklearn.evaluation.eval_holdout') def test_eval_with_limits_holdout(self, pynisher_mock): pynisher_mock.side_effect = safe_eval_success_mock - info = eval_with_limits(self.datamanager, self.tmp, None, 1, 1, - 'holdout', {}, 3000, 30) - self.assertEqual(info[1], 1.0) - self.assertEqual(info[2], 1) - self.assertEqual(info[4], StatusType.SUCCESS) + ta = ExecuteTaFuncWithQueue(backend=BackendMock(), autosklearn_seed=1, + resampling_strategy='holdout', + logger=self.logger) + info = ta.run(None, cutoff=30, memory_limit=3000) + self.assertEqual(info[0], StatusType.SUCCESS) + self.assertEqual(info[1], 0.5) + self.assertIsInstance(info[2], float) @mock.patch('autosklearn.evaluation.eval_holdout') def test_eval_with_limits_holdout_fail_silent(self, pynisher_mock): pynisher_mock.return_value = None - info = eval_with_limits(self.datamanager, self.tmp, None, 1, 1, - 'holdout', {}, 3000, 30) + ta = ExecuteTaFuncWithQueue(backend=BackendMock(), autosklearn_seed=1, + resampling_strategy='holdout', + logger=self.logger) + info = ta.run(None, cutoff=30, memory_limit=3000) + self.assertEqual(info[0], StatusType.CRASHED) self.assertEqual(info[1], 2.0) - self.assertEqual(info[4], StatusType.CRASHED) + self.assertIsInstance(info[2], float) @mock.patch('autosklearn.evaluation.eval_holdout') def test_eval_with_limits_holdout_fail_memory_error(self, pynisher_mock): pynisher_mock.side_effect = MemoryError - info = eval_with_limits(self.datamanager, self.tmp, None, 1, 1, - 'holdout', {}, 3000, 30) + ta = ExecuteTaFuncWithQueue(backend=BackendMock(), autosklearn_seed=1, + resampling_strategy='holdout', + logger=self.logger) + info = ta.run(None, cutoff=30, memory_limit=3000) + self.assertEqual(info[0], StatusType.MEMOUT) self.assertEqual(info[1], 2.0) - self.assertEqual(info[4], StatusType.MEMOUT) + self.assertIsInstance(info[2], float) @mock.patch('autosklearn.evaluation.eval_holdout') def test_eval_with_limits_holdout_fail_timeout(self, pynisher_mock): pynisher_mock.side_effect = pynisher.TimeoutException - info = eval_with_limits(self.datamanager, self.tmp, None, 1, 1, - 'holdout', {}, 3000, 30) + ta = ExecuteTaFuncWithQueue(backend=BackendMock(), autosklearn_seed=1, + resampling_strategy='holdout', + logger=self.logger) + info = ta.run(None, cutoff=30, memory_limit=3000) + self.assertEqual(info[0], StatusType.TIMEOUT) self.assertEqual(info[1], 2.0) - self.assertEqual(info[4], StatusType.TIMEOUT) + self.assertIsInstance(info[2], float) From d11978b476775442a57be191573852fc629003a0 Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Tue, 15 Nov 2016 16:24:28 +0100 Subject: [PATCH 15/38] MAINT specify python version for circle-ci --- circle.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/circle.yml b/circle.yml index 2ecd682cbd..095c7195c9 100644 --- a/circle.yml +++ b/circle.yml @@ -1,4 +1,7 @@ machine: + python: + version: 3.4.3 + environment: # The github organization or username of the repository which hosts the # project and documentation. 
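
For readers following the evaluation changes in the patch above: the rewritten tests treat ExecuteTaFuncWithQueue as a SMAC-style target-algorithm runner whose run() call returns a (status, cost, runtime, additional_info) tuple, replacing the old eval_with_limits 5-tuple. The standalone sketch below (not part of any patch in this series) restates that usage; DummyBackend and fake_eval_success are illustrative stand-ins mirroring the BackendMock and queue-based mock used in the tests, while the imports, constructor arguments and run() signature are taken from the diffs above.

# Standalone sketch: exercising the new SMAC-style evaluation interface.
import logging
import unittest.mock

from smac.tae.execute_ta_run import StatusType
from autosklearn.evaluation import ExecuteTaFuncWithQueue


class DummyBackend(object):
    """Illustrative stand-in for the test suite's BackendMock."""
    def load_datamanager(self):
        # The real tests return a small classification data manager here;
        # None suffices in this sketch only because the evaluation is mocked.
        return None


def fake_eval_success(*args, **kwargs):
    # Mimic a successful holdout evaluation by pushing
    # (status, cost, runtime, additional_info) onto the result queue,
    # as the mocked tests above do.
    kwargs['queue'].put((StatusType.SUCCESS, 0.5, 0.1, ''))


with unittest.mock.patch('autosklearn.evaluation.eval_holdout',
                         side_effect=fake_eval_success):
    ta = ExecuteTaFuncWithQueue(backend=DummyBackend(), autosklearn_seed=1,
                                resampling_strategy='holdout',
                                logger=logging.getLogger(__name__))
    # run() evaluates one configuration under the given cutoff/memory limits.
    status, cost, runtime, additional_info = ta.run(None, cutoff=30,
                                                    memory_limit=3000)
    assert status == StatusType.SUCCESS
    assert cost == 0.5
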
From 677750061a4f5fc1caf841e6ead469fab09784aa Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Tue, 15 Nov 2016 16:26:49 +0100 Subject: [PATCH 16/38] MAINT increase required SMAC version in init file --- autosklearn/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autosklearn/__init__.py b/autosklearn/__init__.py index 209195e1aa..639d87da9e 100644 --- a/autosklearn/__init__.py +++ b/autosklearn/__init__.py @@ -5,7 +5,7 @@ __MANDATORY_PACKAGES__ = ''' scikit-learn==0.17.1 -smac==0.0.1 +smac==0.2 lockfile>=0.10 ConfigSpace>=0.2.1 pyrfr==0.2.0 From eef057a83a19cc9e50e877c78ba36c7d8e7cb00f Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Tue, 15 Nov 2016 16:44:36 +0100 Subject: [PATCH 17/38] MAINT/CI upgrade pip and wheel version --- circle.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/circle.yml b/circle.yml index 095c7195c9..a63ccfe364 100644 --- a/circle.yml +++ b/circle.yml @@ -25,6 +25,8 @@ dependencies: - sudo apt-get update - sudo apt-get install libatlas-dev libatlas3gf-base - sudo apt-get install build-essential python-dev python-setuptools + # upgrade pip and wheel to allow for fast installation + - pip install wheel pip --upgrade # install numpy first as it is a compile time dependency for other packages - pip install --upgrade numpy # install documentation building dependencies From 41ff02a6267604aa5cb379b206adb0d2a237dc42 Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Tue, 15 Nov 2016 16:57:06 +0100 Subject: [PATCH 18/38] MAINT make SMAC requirements stricter --- autosklearn/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autosklearn/__init__.py b/autosklearn/__init__.py index 639d87da9e..ac55aa5e46 100644 --- a/autosklearn/__init__.py +++ b/autosklearn/__init__.py @@ -5,7 +5,7 @@ __MANDATORY_PACKAGES__ = ''' scikit-learn==0.17.1 -smac==0.2 +smac==0.2.0 lockfile>=0.10 ConfigSpace>=0.2.1 pyrfr==0.2.0 From df619b5619ff634ecbcd8281bda80cf923520a60 Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Tue, 15 Nov 2016 17:19:06 +0100 Subject: [PATCH 19/38] FIX reduce issue #160 by checking whether file needs to be locked --- autosklearn/util/backend.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/autosklearn/util/backend.py b/autosklearn/util/backend.py index 8003bf3282..82802045d1 100644 --- a/autosklearn/util/backend.py +++ b/autosklearn/util/backend.py @@ -180,6 +180,19 @@ def save_targets_ensemble(self, targets): filepath = self._get_targets_ensemble_filename() + # Try to open the file without locking it, this will reduce the + # number of times where we erronously keep a lock on the ensemble + # targets file although the process already was killed + try: + existing_targets = np.load(filepath) + if existing_targets.shape[0] > targets.shape[0] or \ + (existing_targets.shape == targets.shape and + np.allclose(existing_targets, targets)): + + return filepath + except Exception: + pass + lock_path = filepath + '.lock' with lockfile.LockFile(lock_path): if os.path.exists(filepath): From ad9801d925aa0d615d7f642fe7a05a5f024e8442 Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Tue, 15 Nov 2016 17:50:38 +0100 Subject: [PATCH 20/38] FIX timing issues (missing cutoff, too low timeout for dummy predictions, removed second dummy prediction --- autosklearn/automl.py | 28 +++++++++++++++------------- autosklearn/evaluation/__init__.py | 4 ++-- autosklearn/smbo.py | 2 ++ 3 files changed, 19 insertions(+), 15 deletions(-) diff --git a/autosklearn/automl.py b/autosklearn/automl.py index 
f4a28102df..74d69c82f4 100644 --- a/autosklearn/automl.py +++ b/autosklearn/automl.py @@ -74,7 +74,8 @@ def __init__(self, self._include_estimators = include_estimators self._include_preprocessors = include_preprocessors self._resampling_strategy = resampling_strategy - self._resampling_strategy_arguments = resampling_strategy_arguments + self._resampling_strategy_arguments = resampling_strategy_arguments \ + if resampling_strategy_arguments is not None else {} self._max_iter_smac = max_iter_smac #self.delete_tmp_folder_after_terminate = \ # delete_tmp_folder_after_terminate @@ -235,29 +236,30 @@ def _print_load_time(basename, time_left_for_this_task, def _do_dummy_prediction(self, datamanager, num_run): self._logger.info("Starting to create dummy predictions.") - time_limit = int(self._time_for_task / 6.) + # time_limit = int(self._time_for_task / 6.) memory_limit = int(self._ml_memory_limit) ta = ExecuteTaFuncWithQueue(backend=self._backend, autosklearn_seed=self._seed, resampling_strategy=self._resampling_strategy, initial_num_run=num_run, - logger=self._logger) + logger=self._logger, + **self._resampling_strategy_arguments) status, cost, runtime, additional_info = \ - ta.run(1, cutoff=time_limit, memory_limit=memory_limit) + ta.run(1, cutoff=self._time_for_task, memory_limit=memory_limit) if status == StatusType.SUCCESS: - self._logger.info("Finished creating dummy prediction 1/2.") + self._logger.info("Finished creating dummy predictions.") else: - self._logger.error('Error creating dummy prediction 1/2:%s ', + self._logger.error('Error creating dummy predictions:%s ', additional_info) - status, cost, runtime, additional_info = \ - ta.run(2, cutoff=time_limit, memory_limit=memory_limit) - if status == StatusType.SUCCESS: - self._logger.info("Finished creating dummy prediction 2/2.") - else: - self._logger.error('Error creating dummy prediction 2/2 %s', - additional_info) + #status, cost, runtime, additional_info = \ + # ta.run(2, cutoff=time_limit, memory_limit=memory_limit) + #if status == StatusType.SUCCESS: + # self._logger.info("Finished creating dummy prediction 2/2.") + #else: + # self._logger.error('Error creating dummy prediction 2/2 %s', + # additional_info) return ta.num_run diff --git a/autosklearn/evaluation/__init__.py b/autosklearn/evaluation/__init__.py index 545f62fc93..3f335c1b15 100644 --- a/autosklearn/evaluation/__init__.py +++ b/autosklearn/evaluation/__init__.py @@ -54,7 +54,6 @@ def __init__(self, backend, autosklearn_seed, resampling_strategy, self.with_predictions = with_predictions self.all_scoring_functions = all_scoring_functions self.output_y_test = output_y_test - self.resampling_strategy_args = resampling_strategy_args self.logger = logger def run(self, config, instance=None, @@ -78,7 +77,8 @@ def run(self, config, instance=None, with_predictions=self.with_predictions, all_scoring_functions=self.all_scoring_functions, output_y_test=self.output_y_test, - subsample=None) + subsample=None, + **self.resampling_strategy_args) obj = pynisher.enforce_limits(**arguments)(self.ta) obj(**obj_kwargs) diff --git a/autosklearn/smbo.py b/autosklearn/smbo.py index 19011ced9e..f06e1a7679 100644 --- a/autosklearn/smbo.py +++ b/autosklearn/smbo.py @@ -744,6 +744,8 @@ def run_smbo(self, max_iters=1000): metalearning_configurations = [] self.scenario = Scenario({'cs': self.config_space, + 'cutoff-time': self.func_eval_time_limit, + 'memory-limit': self.memory_limit, 'wallclock-limit': self.total_walltime_limit, 'instances': [[name] for name in meta_features_dict], 
'output-dir': self.backend.temporary_directory, From dabbf7d3de983d56c564e599b1abad3291c6ab71 Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Tue, 15 Nov 2016 17:53:49 +0100 Subject: [PATCH 21/38] FIX unittest --- test/test_automl/test_start_automl.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/test/test_automl/test_start_automl.py b/test/test_automl/test_start_automl.py index 0ad5ba30df..722a5f710c 100644 --- a/test/test_automl/test_start_automl.py +++ b/test/test_automl/test_start_automl.py @@ -146,9 +146,5 @@ def test_do_dummy_prediction(self): output, '.auto-sklearn', 'predictions_ensemble', 'predictions_ensemble_1_00001.npy'))) - self.assertTrue(os.path.exists(os.path.join( - output, '.auto-sklearn', 'predictions_ensemble', - 'predictions_ensemble_1_00002.npy'))) - del auto self._tearDown(output) From 5a07d4391db7f51f698e8ea180c901b120643462 Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Tue, 15 Nov 2016 18:01:28 +0100 Subject: [PATCH 22/38] MAINT remove duplicate test --- test/automl/test_models.py | 49 -------------------------------------- 1 file changed, 49 deletions(-) delete mode 100644 test/automl/test_models.py diff --git a/test/automl/test_models.py b/test/automl/test_models.py deleted file mode 100644 index f84ec68e67..0000000000 --- a/test/automl/test_models.py +++ /dev/null @@ -1,49 +0,0 @@ -# -*- encoding: utf-8 -*- -from __future__ import print_function -import unittest -import mock -from autosklearn.automl import AutoML -from autosklearn.util.backend import Backend - - -class AutoMLStub(AutoML): - - def __init__(self): - self.__class__ = AutoML - - -class AutoMlModelsTest(unittest.TestCase): - - def setUp(self): - self.automl = AutoMLStub() - self.automl._shared_mode = False - self.automl._seed = 42 - self.automl._backend = mock.Mock(spec=Backend) - self.automl._delete_output_directories = lambda: 0 - - def test_only_loads_ensemble_models(self): - identifiers = [(1, 2), (3, 4)] - models = [ 42 ] - self.automl._backend.load_ensemble.return_value.identifiers_ \ - = identifiers - self.automl._backend.load_models_by_identifiers.side_effect \ - = lambda ids: models if ids is identifiers else None - - self.automl._load_models() - - self.assertEqual(models, self.automl.models_) - - def test_loads_all_models_if_no_ensemble(self): - models = [ 42 ] - self.automl._backend.load_ensemble.return_value = None - self.automl._backend.load_all_models.return_value = models - - self.automl._load_models() - - self.assertEqual(models, self.automl.models_) - - def test_raises_if_no_models(self): - self.automl._backend.load_ensemble.return_value = None - self.automl._backend.load_all_models.return_value = [] - - self.assertRaises(ValueError, self.automl._load_models) From e2219f3bba1913348d9fe9187777718b936234a1 Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Tue, 15 Nov 2016 18:11:39 +0100 Subject: [PATCH 23/38] MAINT remove mock module, use unittest.mock instead --- autosklearn/automl.py | 3 --- requirements.txt | 1 - setup.py | 1 - test/test_automl/test_estimators.py | 14 +++++--------- test/test_automl/test_models.py | 7 ++----- test/test_evaluation/test_evaluation.py | 14 +++++--------- test/test_pipeline/test_classification.py | 21 +++++++++------------ test/test_pipeline/test_regression.py | 9 +++------ test/test_util/test_backend.py | 15 ++++++--------- test/test_util/test_dependencies.py | 3 +-- test/util/test_backend.py | 12 ++++++------ testcommand.sh | 2 +- 12 files changed, 38 insertions(+), 64 deletions(-) diff --git a/autosklearn/automl.py 
b/autosklearn/automl.py index 74d69c82f4..0b5ac6c5a2 100644 --- a/autosklearn/automl.py +++ b/autosklearn/automl.py @@ -5,7 +5,6 @@ import hashlib import io import os -import unittest.mock from ConfigSpace.io import pcs @@ -14,8 +13,6 @@ import scipy.stats from sklearn.base import BaseEstimator from smac.tae.execute_ta_run import StatusType -import smac.stats.stats -from smac.runhistory.runhistory import RunHistory from sklearn.grid_search import _CVScoreTuple from autosklearn.constants import * diff --git a/requirements.txt b/requirements.txt index dee4d27145..c5390b1efb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,5 @@ unittest2 setuptools -mock nose six diff --git a/setup.py b/setup.py index 1618fae12b..cf50e71ad9 100644 --- a/setup.py +++ b/setup.py @@ -16,7 +16,6 @@ requirements = [ "unittest2", "setuptools", - "mock", "nose", "six", "Cython", diff --git a/test/test_automl/test_estimators.py b/test/test_automl/test_estimators.py index 2815347ae0..6025c235f8 100644 --- a/test/test_automl/test_estimators.py +++ b/test/test_automl/test_estimators.py @@ -4,14 +4,10 @@ import os import sys import unittest +import unittest.mock import sklearn -try: - import mock -except ImportError: - from unittest import mock - import numpy as np import numpy.ma as npma from sklearn.grid_search import _CVScoreTuple @@ -182,7 +178,7 @@ def test_grid_scores(self): ensemble_size=0) cls_ = cls.build_automl() automl = cls_._automl - automl._proc_smac = mock.MagicMock() + automl._proc_smac = unittest.mock.MagicMock() RunKey = collections.namedtuple( 'RunKey', ['config_id', 'instance_id', 'seed']) @@ -199,7 +195,7 @@ def test_grid_scores(self): # In the runhistory we store losses, thus the score is zero self.assertEqual(grid_scores_[0].mean_validation_score, 0) self.assertEqual(grid_scores_[0].cv_validation_scores, [0]) - self.assertIsInstance(grid_scores_[0].parameters, mock.MagicMock) + self.assertIsInstance(grid_scores_[0].parameters, unittest.mock.MagicMock) del automl self._tearDown(output) @@ -242,7 +238,7 @@ def test_multiclass_prediction(self): predicted_indexes = [2, 1, 0, 1, 2] expected_result = ['c', 'b', 'a', 'b', 'c'] - automl_mock = mock.Mock() + automl_mock = unittest.mock.Mock() automl_mock.predict.return_value = np.array(predicted_probabilities) classifier = AutoMLClassifier(automl_mock) @@ -264,7 +260,7 @@ def test_multilabel_prediction(self): predicted_indexes = [[2, 0], [1, 0], [0, 1], [1, 1], [2, 1]] expected_result = np.array([['c', 13], ['b', 13], ['a', 17], ['b', 17], ['c', 17]], dtype=object) - automl_mock = mock.Mock() + automl_mock = unittest.mock.Mock() automl_mock.predict.return_value = np.matrix(predicted_probabilities) classifier = AutoMLClassifier(automl_mock) diff --git a/test/test_automl/test_models.py b/test/test_automl/test_models.py index 486afd2d54..f91fa3871d 100644 --- a/test/test_automl/test_models.py +++ b/test/test_automl/test_models.py @@ -1,10 +1,7 @@ # -*- encoding: utf-8 -*- from __future__ import print_function import unittest -try: - import mock -except ImportError: - from unittest import mock +import unittest.mock from autosklearn.automl import AutoML from autosklearn.util.backend import Backend @@ -22,7 +19,7 @@ def setUp(self): self.automl = AutoMLStub() self.automl._shared_mode = False self.automl._seed = 42 - self.automl._backend = mock.Mock(spec=Backend) + self.automl._backend = unittest.mock.Mock(spec=Backend) self.automl._delete_output_directories = lambda: 0 def test_only_loads_ensemble_models(self): diff --git 
a/test/test_evaluation/test_evaluation.py b/test/test_evaluation/test_evaluation.py index abf3c8825c..c5f1c1804a 100644 --- a/test/test_evaluation/test_evaluation.py +++ b/test/test_evaluation/test_evaluation.py @@ -4,14 +4,10 @@ import sys import time import unittest +import unittest.mock import numpy as np -if sys.version_info[0] == 2: - import mock -else: - from unittest import mock - this_directory = os.path.dirname(__file__) sys.path.append(this_directory) @@ -67,7 +63,7 @@ def run_over_time(): safe_eval() self.assertEqual(safe_eval.exit_status, pynisher.TimeoutException) - @mock.patch('autosklearn.evaluation.eval_holdout') + @unittest.mock.patch('autosklearn.evaluation.eval_holdout') def test_eval_with_limits_holdout(self, pynisher_mock): pynisher_mock.side_effect = safe_eval_success_mock ta = ExecuteTaFuncWithQueue(backend=BackendMock(), autosklearn_seed=1, @@ -78,7 +74,7 @@ def test_eval_with_limits_holdout(self, pynisher_mock): self.assertEqual(info[1], 0.5) self.assertIsInstance(info[2], float) - @mock.patch('autosklearn.evaluation.eval_holdout') + @unittest.mock.patch('autosklearn.evaluation.eval_holdout') def test_eval_with_limits_holdout_fail_silent(self, pynisher_mock): pynisher_mock.return_value = None ta = ExecuteTaFuncWithQueue(backend=BackendMock(), autosklearn_seed=1, @@ -89,7 +85,7 @@ def test_eval_with_limits_holdout_fail_silent(self, pynisher_mock): self.assertEqual(info[1], 2.0) self.assertIsInstance(info[2], float) - @mock.patch('autosklearn.evaluation.eval_holdout') + @unittest.mock.patch('autosklearn.evaluation.eval_holdout') def test_eval_with_limits_holdout_fail_memory_error(self, pynisher_mock): pynisher_mock.side_effect = MemoryError ta = ExecuteTaFuncWithQueue(backend=BackendMock(), autosklearn_seed=1, @@ -100,7 +96,7 @@ def test_eval_with_limits_holdout_fail_memory_error(self, pynisher_mock): self.assertEqual(info[1], 2.0) self.assertIsInstance(info[2], float) - @mock.patch('autosklearn.evaluation.eval_holdout') + @unittest.mock.patch('autosklearn.evaluation.eval_holdout') def test_eval_with_limits_holdout_fail_timeout(self, pynisher_mock): pynisher_mock.side_effect = pynisher.TimeoutException ta = ExecuteTaFuncWithQueue(backend=BackendMock(), autosklearn_seed=1, diff --git a/test/test_pipeline/test_classification.py b/test/test_pipeline/test_classification.py index 78a564a206..a1f317dd3a 100644 --- a/test/test_pipeline/test_classification.py +++ b/test/test_pipeline/test_classification.py @@ -4,10 +4,7 @@ import sys import traceback import unittest -try: - import mock -except ImportError: - from unittest import mock +import unittest.mock import numpy as np import sklearn.datasets @@ -398,7 +395,7 @@ def test_predict_batched(self): cls.fit(X_train, Y_train) X_test_ = X_test.copy() prediction_ = cls.predict(X_test_) - cls_predict = mock.Mock(wraps=cls.pipeline_) + cls_predict = unittest.mock.Mock(wraps=cls.pipeline_) cls.pipeline_ = cls_predict prediction = cls.predict(X_test, batch_size=20) self.assertEqual((1647,), prediction.shape) @@ -412,7 +409,7 @@ def test_predict_batched(self): cls.fit(X_train, Y_train) X_test_ = X_test.copy() prediction_ = cls.predict(X_test_) - cls_predict = mock.Mock(wraps=cls.pipeline_) + cls_predict = unittest.mock.Mock(wraps=cls.pipeline_) cls.pipeline_ = cls_predict prediction = cls.predict(X_test, batch_size=20) self.assertEqual((1647, 10), prediction.shape) @@ -447,7 +444,7 @@ def test_predict_batched_sparse(self): cls.fit(X_train, Y_train) X_test_ = X_test.copy() prediction_ = cls.predict(X_test_) - cls_predict = 
mock.Mock(wraps=cls.pipeline_) + cls_predict = unittest.mock.Mock(wraps=cls.pipeline_) cls.pipeline_ = cls_predict prediction = cls.predict(X_test, batch_size=20) self.assertEqual((1647,), prediction.shape) @@ -462,7 +459,7 @@ def test_predict_batched_sparse(self): cls.fit(X_train, Y_train) X_test_ = X_test.copy() prediction_ = cls.predict(X_test_) - cls_predict = mock.Mock(wraps=cls.pipeline_) + cls_predict = unittest.mock.Mock(wraps=cls.pipeline_) cls.pipeline_ = cls_predict prediction = cls.predict(X_test, batch_size=20) self.assertEqual((1647, 10), prediction.shape) @@ -480,7 +477,7 @@ def test_predict_proba_batched(self): X_test_ = X_test.copy() prediction_ = cls.predict_proba(X_test_) # The object behind the last step in the pipeline - cls_predict = mock.Mock(wraps=cls.pipeline_.steps[-1][1]) + cls_predict = unittest.mock.Mock(wraps=cls.pipeline_.steps[-1][1]) cls.pipeline_.steps[-1] = ("estimator", cls_predict) prediction = cls.predict_proba(X_test, batch_size=20) self.assertEqual((1647, 10), prediction.shape) @@ -495,7 +492,7 @@ def test_predict_proba_batched(self): cls.fit(X_train, Y_train) X_test_ = X_test.copy() prediction_ = cls.predict_proba(X_test_) - cls_predict = mock.Mock(wraps=cls.pipeline_.steps[-1][1]) + cls_predict = unittest.mock.Mock(wraps=cls.pipeline_.steps[-1][1]) cls.pipeline_.steps[-1] = ("estimator", cls_predict) prediction = cls.predict_proba(X_test, batch_size=20) self.assertIsInstance(prediction, np.ndarray) @@ -533,7 +530,7 @@ def test_predict_proba_batched_sparse(self): X_test_ = X_test.copy() prediction_ = cls.predict_proba(X_test_) # The object behind the last step in the pipeline - cls_predict = mock.Mock(wraps=cls.pipeline_.steps[-1][1]) + cls_predict = unittest.mock.Mock(wraps=cls.pipeline_.steps[-1][1]) cls.pipeline_.steps[-1] = ("estimator", cls_predict) prediction = cls.predict_proba(X_test, batch_size=20) self.assertEqual((1647, 10), prediction.shape) @@ -549,7 +546,7 @@ def test_predict_proba_batched_sparse(self): cls.fit(X_train, Y_train) X_test_ = X_test.copy() prediction_ = cls.predict_proba(X_test_) - cls_predict = mock.Mock(wraps=cls.pipeline_.steps[-1][1]) + cls_predict = unittest.mock.Mock(wraps=cls.pipeline_.steps[-1][1]) cls.pipeline_.steps[-1] = ("estimator", cls_predict) prediction = cls.predict_proba(X_test, batch_size=20) self.assertEqual(prediction.shape, ((1647, 10))) diff --git a/test/test_pipeline/test_regression.py b/test/test_pipeline/test_regression.py index a6e733c8ba..7271a1b222 100644 --- a/test/test_pipeline/test_regression.py +++ b/test/test_pipeline/test_regression.py @@ -3,10 +3,7 @@ import sys import traceback import unittest -try: - import mock -except ImportError: - from unittest import mock +import unittest.mock import numpy as np import sklearn.datasets @@ -376,7 +373,7 @@ def test_predict_batched(self): cls.fit(X_train, Y_train) X_test_ = X_test.copy() prediction_ = cls.predict(X_test_) - cls_predict = mock.Mock(wraps=cls.pipeline_) + cls_predict = unittest.mock.Mock(wraps=cls.pipeline_) cls.pipeline_ = cls_predict prediction = cls.predict(X_test, batch_size=20) self.assertEqual((356,), prediction.shape) @@ -394,7 +391,7 @@ def test_predict_batched_sparse(self): cls.fit(X_train, Y_train) X_test_ = X_test.copy() prediction_ = cls.predict(X_test_) - cls_predict = mock.Mock(wraps=cls.pipeline_) + cls_predict = unittest.mock.Mock(wraps=cls.pipeline_) cls.pipeline_ = cls_predict prediction = cls.predict(X_test, batch_size=20) self.assertEqual((356,), prediction.shape) diff --git a/test/test_util/test_backend.py 
b/test/test_util/test_backend.py index 375d9cc121..28d10cb717 100644 --- a/test/test_util/test_backend.py +++ b/test/test_util/test_backend.py @@ -1,10 +1,7 @@ # -*- encoding: utf-8 -*- from __future__ import print_function import unittest -try: - import mock -except ImportError: - from unittest import mock +import unittest.mock from autosklearn.util.backend import Backend @@ -27,8 +24,8 @@ def setUp(self): self.backend = self.BackendStub() self.backend.get_model_dir = lambda: self.model_directory - @mock.patch('six.moves.cPickle.load') - @mock.patch.object(builtins, 'open') + @unittest.mock.patch('six.moves.cPickle.load') + @unittest.mock.patch.object(builtins, 'open') def test_loads_model_by_seed_and_id(self, openMock, pickleLoadMock): seed = 13 idx = 17 @@ -38,8 +35,8 @@ def test_loads_model_by_seed_and_id(self, openMock, pickleLoadMock): self.assertEqual(expected_model, actual_model) - @mock.patch('six.moves.cPickle.load') - @mock.patch.object(builtins, 'open') + @unittest.mock.patch('six.moves.cPickle.load') + @unittest.mock.patch.object(builtins, 'open') def test_loads_models_by_identifiers(self, openMock, pickleLoadMock): seed = 13 idx = 17 @@ -56,7 +53,7 @@ def _setup_load_model_mocks(self, openMock, pickleLoadMock, seed, idx): file_handler = 'file_handler' expected_model = 'model' - fileMock = mock.MagicMock() + fileMock = unittest.mock.MagicMock() fileMock.__enter__.return_value = file_handler openMock.side_effect = lambda path, flag: fileMock if path == model_path and flag == 'rb' else None diff --git a/test/test_util/test_dependencies.py b/test/test_util/test_dependencies.py index dc561c97c9..6e2584144d 100644 --- a/test/test_util/test_dependencies.py +++ b/test/test_util/test_dependencies.py @@ -1,12 +1,11 @@ import unittest -import warnings import re from unittest.mock import patch, Mock import pkg_resources -from autosklearn.util.dependencies import verify_packages, _verify_package, MissingPackageError, \ +from autosklearn.util.dependencies import verify_packages, MissingPackageError, \ IncorrectPackageVersionError diff --git a/test/util/test_backend.py b/test/util/test_backend.py index 1fc18ca4b3..18011a2900 100644 --- a/test/util/test_backend.py +++ b/test/util/test_backend.py @@ -1,7 +1,7 @@ # -*- encoding: utf-8 -*- from __future__ import print_function import unittest -import mock +import unittest.mock from autosklearn.util.backend import Backend from sys import version_info @@ -23,8 +23,8 @@ def setUp(self): self.backend = self.BackendStub() self.backend.get_model_dir = lambda: self.model_directory - @mock.patch('six.moves.cPickle.load') - @mock.patch.object(builtins, 'open') + @unittest.mock.patch('six.moves.cPickle.load') + @unittest.mock.patch.object(builtins, 'open') def test_loads_model_by_seed_and_id(self, openMock, pickleLoadMock): seed = 13 idx = 17 @@ -34,8 +34,8 @@ def test_loads_model_by_seed_and_id(self, openMock, pickleLoadMock): self.assertEqual(expected_model, actual_model) - @mock.patch('six.moves.cPickle.load') - @mock.patch.object(builtins, 'open') + @unittest.mock.patch('six.moves.cPickle.load') + @unittest.mock.patch.object(builtins, 'open') def test_loads_models_by_identifiers(self, openMock, pickleLoadMock): seed = 13 idx = 17 @@ -52,7 +52,7 @@ def _setup_load_model_mocks(self, openMock, pickleLoadMock, seed, idx): file_handler = 'file_handler' expected_model = 'model' - fileMock = mock.MagicMock() + fileMock = unittest.mock.MagicMock() fileMock.__enter__.return_value = file_handler openMock.side_effect = lambda path, flag: fileMock if path == 
model_path and flag == 'rb' else None diff --git a/testcommand.sh b/testcommand.sh index 367a087990..d14b13659f 100644 --- a/testcommand.sh +++ b/testcommand.sh @@ -1,2 +1,2 @@ #!/usr/bin/env bash -nosetests --processes=3 --process-timeout=120 -v $1 \ No newline at end of file +nosetests --processes=16 --process-timeout=240 -v $1 \ No newline at end of file From 9274ec308974b0d6702e7f98a0b8a2c3be1cbe11 Mon Sep 17 00:00:00 2001 From: Anatolii Domashnev Date: Fri, 18 Nov 2016 15:18:28 +0100 Subject: [PATCH 24/38] FIX #170 Throw Python34 compatible exception --- autosklearn/util/dependencies.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autosklearn/util/dependencies.py b/autosklearn/util/dependencies.py index 61ed76adf5..d36ad7e5c8 100644 --- a/autosklearn/util/dependencies.py +++ b/autosklearn/util/dependencies.py @@ -33,7 +33,7 @@ def _verify_package(name, operation, version): try: module = pkg_resources.get_distribution(name) except pkg_resources.DistributionNotFound: - raise MissingPackageError(name) from None + raise MissingPackageError(name) if not operation: return From 58f409aa229ac1b16bbac3bd8a16a49140224cd8 Mon Sep 17 00:00:00 2001 From: Anatolii Domashnev Date: Fri, 18 Nov 2016 16:22:58 +0100 Subject: [PATCH 25/38] FIX #115 Transpose X if data is not C contiguous --- autosklearn/automl.py | 5 ++++- test/test_automl/test_estimators.py | 20 ++++++++++++++++++++ 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/autosklearn/automl.py b/autosklearn/automl.py index 0b5ac6c5a2..0d663c8c41 100644 --- a/autosklearn/automl.py +++ b/autosklearn/automl.py @@ -149,7 +149,10 @@ def fit(self, X, y, if dataset_name is None: m = hashlib.md5() - m.update(X.data) + if X.flags['C_CONTIGUOUS']: + m.update(X.data) + else: + m.update(X.T.data) dataset_name = m.hexdigest() self._backend.save_start_time(self._seed) diff --git a/test/test_automl/test_estimators.py b/test/test_automl/test_estimators.py index 6025c235f8..e4e6a97646 100644 --- a/test/test_automl/test_estimators.py +++ b/test/test_automl/test_estimators.py @@ -227,6 +227,26 @@ def test_cv_results(self): del cls self._tearDown(output) + def test_f_contiguous_array(self): + + output = os.path.join(self.test_dir, '..', '.tmp_estimator_fit') + self._setUp(output) + + X_train, Y_train, X_test, Y_test = putil.get_dataset('iris') + X_train = np.asfortranarray(X_train) + automl = AutoSklearnClassifier(time_left_for_this_task=15, + per_run_time_limit=5, + tmp_folder=output, + output_folder=output) + automl.fit(X_train, Y_train) + score = automl.score(X_test, Y_test) + print(automl.show_models()) + + self.assertGreaterEqual(score, 0.8) + self.assertEqual(automl._automl._automl._task, MULTICLASS_CLASSIFICATION) + + del automl + self._tearDown(output) class AutoMLClassifierTest(unittest.TestCase): From 4f8fae616721ad14dec0e810f876122d07cda51b Mon Sep 17 00:00:00 2001 From: Anatolii Domashnev Date: Fri, 18 Nov 2016 16:52:20 +0100 Subject: [PATCH 26/38] Move hash function into separate file --- autosklearn/automl.py | 8 ++--- autosklearn/util/hash.py | 13 ++++++++ test/test_automl/test_estimators.py | 23 +------------ test/test_util/test_hash.py | 51 +++++++++++++++++++++++++++++ 4 files changed, 67 insertions(+), 28 deletions(-) create mode 100644 autosklearn/util/hash.py create mode 100644 test/test_util/test_hash.py diff --git a/autosklearn/automl.py b/autosklearn/automl.py index 0d663c8c41..3b9a0f768a 100644 --- a/autosklearn/automl.py +++ b/autosklearn/automl.py @@ -25,6 +25,7 @@ pipeline from 
autosklearn.ensemble_builder import EnsembleBuilder from autosklearn.smbo import AutoMLSMBO +from autosklearn.util.hash import hash_numpy_array class AutoML(BaseEstimator): @@ -148,12 +149,7 @@ def fit(self, X, y, self._backend.context.create_directories() if dataset_name is None: - m = hashlib.md5() - if X.flags['C_CONTIGUOUS']: - m.update(X.data) - else: - m.update(X.T.data) - dataset_name = m.hexdigest() + dataset_name = hash_numpy_array(X) self._backend.save_start_time(self._seed) self._stopwatch = StopWatch() diff --git a/autosklearn/util/hash.py b/autosklearn/util/hash.py new file mode 100644 index 0000000000..fbf635f396 --- /dev/null +++ b/autosklearn/util/hash.py @@ -0,0 +1,13 @@ +import hashlib + + +def hash_numpy_array(X): + m = hashlib.md5() + + if X.flags['C_CONTIGUOUS']: + m.update(X.data) + else: + m.update(X.T.data) + + hash = m.hexdigest() + return hash \ No newline at end of file diff --git a/test/test_automl/test_estimators.py b/test/test_automl/test_estimators.py index e4e6a97646..9cfed64965 100644 --- a/test/test_automl/test_estimators.py +++ b/test/test_automl/test_estimators.py @@ -227,28 +227,7 @@ def test_cv_results(self): del cls self._tearDown(output) - def test_f_contiguous_array(self): - - output = os.path.join(self.test_dir, '..', '.tmp_estimator_fit') - self._setUp(output) - - X_train, Y_train, X_test, Y_test = putil.get_dataset('iris') - X_train = np.asfortranarray(X_train) - automl = AutoSklearnClassifier(time_left_for_this_task=15, - per_run_time_limit=5, - tmp_folder=output, - output_folder=output) - automl.fit(X_train, Y_train) - score = automl.score(X_test, Y_test) - print(automl.show_models()) - - self.assertGreaterEqual(score, 0.8) - self.assertEqual(automl._automl._automl._task, MULTICLASS_CLASSIFICATION) - - del automl - self._tearDown(output) - - + class AutoMLClassifierTest(unittest.TestCase): def test_multiclass_prediction(self): diff --git a/test/test_util/test_hash.py b/test/test_util/test_hash.py new file mode 100644 index 0000000000..fb579b1a30 --- /dev/null +++ b/test/test_util/test_hash.py @@ -0,0 +1,51 @@ +import unittest + +import numpy as np + +from autosklearn.util.hash import hash_numpy_array + + +class HashTests(unittest.TestCase): + + def test_c_contiguous_array(self): + array = np.array([[1, 2], [3, 4]]) + + hash = hash_numpy_array(array) + + self.assertIsNotNone(hash) + + def test_f_contiguous_array(self): + array = np.array([[1, 2], [3, 4]]) + array = np.asfortranarray(array) + + hash = hash_numpy_array(array) + + self.assertIsNotNone(hash) + + def test_transpose_arrays(self): + c_array = np.array([[1, 2], [3, 4]]) + f_array = np.array([[1, 3], [2, 4]]) + f_array = np.asfortranarray(f_array) + + c_hash = hash_numpy_array(c_array) + f_hash = hash_numpy_array(f_array) + + self.assertEqual(c_hash, f_hash) + + def test_same_data_arrays(self): + first_array = np.array([[1, 2], [3, 4]]) + second_array = np.array([[1, 2], [3, 4]]) + + first_hash = hash_numpy_array(first_array) + second_hash = hash_numpy_array(second_array) + + self.assertEqual(first_hash, second_hash) + + def test_different_data_arrays(self): + first_array = np.array([[1, 2], [3, 4]]) + second_array = np.array([[1, 3], [2, 4]]) + + first_hash = hash_numpy_array(first_array) + second_hash = hash_numpy_array(second_array) + + self.assertNotEqual(first_hash, second_hash) \ No newline at end of file From a8c9bc18078d582489a316a853100ac37852b94f Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Fri, 18 Nov 2016 17:00:19 +0100 Subject: [PATCH 27/38] FIX issue #187 --- 
autosklearn/smbo.py | 609 ++++------------------------------ test/test_automl/test_smbo.py | 41 +++ 2 files changed, 112 insertions(+), 538 deletions(-) create mode 100644 test/test_automl/test_smbo.py diff --git a/autosklearn/smbo.py b/autosklearn/smbo.py index f06e1a7679..ef1f41c7a1 100644 --- a/autosklearn/smbo.py +++ b/autosklearn/smbo.py @@ -225,7 +225,6 @@ def __init__(self, config_space, dataset_name, self.shared_mode = shared_mode self.runhistory = None - self.config_space.seed(self.seed) logger_name = '%s(%d):%s' % (self.__class__.__name__, self.seed, ":" + dataset_name if dataset_name is not None else "") @@ -249,178 +248,6 @@ def reset_data_manager(self, max_mem=None): max_mem = max_mem) self.metric = self.datamanager.info['metric'] self.task = self.datamanager.info['task'] - - def collect_additional_subset_defaults(self): - default_configs = [] - # == set default configurations - # first enqueue the default configuration from our config space - if self.datamanager.info["task"] in CLASSIFICATION_TASKS: - config_dict = {'balancing:strategy': 'weighting', - 'classifier:__choice__': 'sgd', - 'classifier:sgd:loss': 'hinge', - 'classifier:sgd:penalty': 'l2', - 'classifier:sgd:alpha': 0.0001, - 'classifier:sgd:fit_intercept': 'True', - 'classifier:sgd:n_iter': 5, - 'classifier:sgd:learning_rate': 'optimal', - 'classifier:sgd:eta0': 0.01, - 'classifier:sgd:average': 'True', - 'imputation:strategy': 'mean', - 'one_hot_encoding:use_minimum_fraction': 'True', - 'one_hot_encoding:minimum_fraction': 0.1, - 'preprocessor:__choice__': 'no_preprocessing', - 'rescaling:__choice__': 'min/max'} - try: - config = Configuration(self.config_space, config_dict) - default_configs.append(config) - except ValueError as e: - self.logger.warning("Second default configurations %s cannot" - " be evaluated because of %s" % - (config_dict, e)) - - if self.datamanager.info["is_sparse"]: - config_dict = {'classifier:__choice__': 'extra_trees', - 'classifier:extra_trees:bootstrap': 'False', - 'classifier:extra_trees:criterion': 'gini', - 'classifier:extra_trees:max_depth': 'None', - 'classifier:extra_trees:max_features': 1.0, - 'classifier:extra_trees:min_samples_leaf': 5, - 'classifier:extra_trees:min_samples_split': 5, - 'classifier:extra_trees:min_weight_fraction_leaf': 0.0, - 'classifier:extra_trees:n_estimators': 100, - 'balancing:strategy': 'weighting', - 'imputation:strategy': 'mean', - 'one_hot_encoding:use_minimum_fraction': 'True', - 'one_hot_encoding:minimum_fraction': 0.1, - 'preprocessor:__choice__': 'truncatedSVD', - 'preprocessor:truncatedSVD:target_dim': 20, - 'rescaling:__choice__': 'min/max'} - else: - n_data_points = self.datamanager.data['X_train'].shape[0] - percentile = 20. / n_data_points - percentile = max(percentile, 2.) 
- - config_dict = {'classifier:__choice__': 'extra_trees', - 'classifier:extra_trees:bootstrap': 'False', - 'classifier:extra_trees:criterion': 'gini', - 'classifier:extra_trees:max_depth': 'None', - 'classifier:extra_trees:max_features': 1.0, - 'classifier:extra_trees:min_samples_leaf': 5, - 'classifier:extra_trees:min_samples_split': 5, - 'classifier:extra_trees:min_weight_fraction_leaf': 0.0, - 'classifier:extra_trees:n_estimators': 100, - 'balancing:strategy': 'weighting', - 'imputation:strategy': 'mean', - 'one_hot_encoding:use_minimum_fraction': 'True', - 'one_hot_encoding:minimum_fraction': 0.1, - 'preprocessor:__choice__': 'select_percentile_classification', - 'preprocessor:select_percentile_classification:percentile': percentile, - 'preprocessor:select_percentile_classification:score_func': 'chi2', - 'rescaling:__choice__': 'min/max'} - - try: - config = Configuration(self.config_space, config_dict) - default_configs.append(config) - except ValueError as e: - self.logger.warning("Third default configurations %s cannot" - " be evaluated because of %s" % - (config_dict, e)) - - if self.datamanager.info["is_sparse"]: - config_dict = {'balancing:strategy': 'weighting', - 'classifier:__choice__': 'multinomial_nb', - 'classifier:multinomial_nb:alpha': 1.0, - 'classifier:multinomial_nb:fit_prior': 'True', - 'imputation:strategy': 'mean', - 'one_hot_encoding:use_minimum_fraction': 'True', - 'one_hot_encoding:minimum_fraction': 0.1, - 'preprocessor:__choice__': 'no_preprocessing', - 'rescaling:__choice__': 'none'} - else: - config_dict = {'balancing:strategy': 'weighting', - 'classifier:__choice__': 'gaussian_nb', - 'imputation:strategy': 'mean', - 'one_hot_encoding:use_minimum_fraction': 'True', - 'one_hot_encoding:minimum_fraction': 0.1, - 'preprocessor:__choice__': 'no_preprocessing', - 'rescaling:__choice__': 'standardize'} - try: - config = Configuration(self.config_space, config_dict) - default_configs.append(config) - except ValueError as e: - self.logger.warning("Forth default configurations %s cannot" - " be evaluated because of %s" % - (config_dict, e)) - - elif self.datamanager.info["task"] in REGRESSION_TASKS: - config_dict = {'regressor:__choice__': 'sgd', - 'regressor:sgd:loss': 'squared_loss', - 'regressor:sgd:penalty': 'l2', - 'regressor:sgd:alpha': 0.0001, - 'regressor:sgd:fit_intercept': 'True', - 'regressor:sgd:n_iter': 5, - 'regressor:sgd:learning_rate': 'optimal', - 'regressor:sgd:eta0': 0.01, - 'regressor:sgd:average': 'True', - 'imputation:strategy': 'mean', - 'one_hot_encoding:use_minimum_fraction': 'True', - 'one_hot_encoding:minimum_fraction': 0.1, - 'preprocessor:__choice__': 'no_preprocessing', - 'rescaling:__choice__': 'min/max'} - try: - config = Configuration(self.config_space, config_dict) - default_configs.append(config) - except ValueError as e: - self.logger.warning("Second default configurations %s cannot" - " be evaluated because of %s" % - (config_dict, e)) - - if self.datamanager.info["is_sparse"]: - config_dict = {'regressor:__choice__': 'extra_trees', - 'regressor:extra_trees:bootstrap': 'False', - 'regressor:extra_trees:criterion': 'mse', - 'regressor:extra_trees:max_depth': 'None', - 'regressor:extra_trees:max_features': 1.0, - 'regressor:extra_trees:min_samples_leaf': 5, - 'regressor:extra_trees:min_samples_split': 5, - 'regressor:extra_trees:n_estimators': 100, - 'imputation:strategy': 'mean', - 'one_hot_encoding:use_minimum_fraction': 'True', - 'one_hot_encoding:minimum_fraction': 0.1, - 'preprocessor:__choice__': 'truncatedSVD', - 
'preprocessor:truncatedSVD:target_dim': 10, - 'rescaling:__choice__': 'min/max'} - else: - config_dict = {'regressor:__choice__': 'extra_trees', - 'regressor:extra_trees:bootstrap': 'False', - 'regressor:extra_trees:criterion': 'mse', - 'regressor:extra_trees:max_depth': 'None', - 'regressor:extra_trees:max_features': 1.0, - 'regressor:extra_trees:min_samples_leaf': 5, - 'regressor:extra_trees:min_samples_split': 5, - 'regressor:extra_trees:n_estimators': 100, - 'imputation:strategy': 'mean', - 'one_hot_encoding:use_minimum_fraction': 'True', - 'one_hot_encoding:minimum_fraction': 0.1, - 'preprocessor:__choice__': 'pca', - 'preprocessor:pca:keep_variance': 0.9, - 'preprocessor:pca:whiten': 'False', - 'rescaling:__choice__': 'min/max'} - - try: - config = Configuration(self.config_space, config_dict) - default_configs.append(config) - except ValueError as e: - self.logger.warning("Third default configurations %s cannot" - " be evaluated because of %s" % - (config_dict, e)) - - else: - self.logger.info("Tasktype unknown: %s" % - TASK_TYPES_TO_STRING[self.datamanager.info[ - "task"]]) - - return default_configs def collect_metalearning_suggestions(self, meta_base): metalearning_configurations = _get_metalearning_configurations( @@ -510,108 +337,17 @@ def run_smbo(self, max_iters=1000): # == Initialize non-SMBO stuff # first create a scenario - seed = self.seed # TODO + seed = self.seed + self.config_space.seed(seed) num_params = len(self.config_space.get_hyperparameters()) # allocate a run history num_run = self.start_num_run instance_id = self.dataset_name + SENTINEL - # # == Train on subset - # # before doing anything, let us run the default_cfg - # # on a subset of the available data to ensure that - # # we at least have some models - # # we will try three different ratios of decreasing magnitude - # # in the hope that at least on the last one we will be able - # # to get a model - # n_data = self.datamanager.data['X_train'].shape[0] - # subset_ratio = 10000. / n_data - # if subset_ratio >= 0.5: - # subset_ratio = 0.33 - # subset_ratios = [subset_ratio, subset_ratio * 0.10] - # else: - # subset_ratios = [subset_ratio, 500. / n_data] - # self.logger.info("Training default configurations on a subset of " - # "%d/%d data points." 
% - # (int(n_data * subset_ratio), n_data)) - # - # # the time limit for these function evaluations is rigorously - # # set to only 1/2 of a full function evaluation - # subset_time_limit = max(5, int(self.func_eval_time_limit / 2)) - # # the configs we want to run on the data subset are: - # # 1) the default configs - # # 2) a set of configs we selected for training on a subset - # subset_configs = [self.config_space.get_default_configuration()] \ - # + self.collect_additional_subset_defaults() - # subset_config_succesful = [False] * len(subset_configs) - # for subset_config_id, next_config in enumerate(subset_configs): - # for i, ratio in enumerate(subset_ratios): - # self.reset_data_manager() - # n_data_subsample = int(n_data * ratio) - # - # # run the config, but throw away the result afterwards - # # since this cfg was evaluated only on a subset - # # and we don't want to confuse SMAC - # self.logger.info("Starting to evaluate %d on SUBSET " - # "with size %d and time limit %ds.", - # num_run, n_data_subsample, - # subset_time_limit) - # self.logger.info(next_config) - # _info = eval_with_limits( - # datamanager=self.datamanager, backend=self.backend, - # config=next_config, seed=seed, num_run=num_run, - # resampling_strategy=self.resampling_strategy, - # resampling_strategy_args=self.resampling_strategy_args, - # memory_limit=self.memory_limit, - # func_eval_time_limit=subset_time_limit, - # subsample=n_data_subsample, - # logger=self.logger) - # (duration, result, _, additional_run_info, status) = _info - # self.logger.info("Finished evaluating %d. configuration on SUBSET. " - # "Duration %f; loss %f; status %s; additional run " - # "info: %s ", num_run, duration, result, - # str(status), additional_run_info) - # - # num_run += 1 - # if i < len(subset_ratios) - 1: - # if status != StatusType.SUCCESS: - # # Do not increase num_run here, because we will try - # # the same configuration with less data - # self.logger.info("A CONFIG did not finish " - # " for subset ratio %f -> going smaller", - # ratio) - # continue - # else: - # self.logger.info("Finished SUBSET training successfully" - # " with ratio %f", ratio) - # subset_config_succesful[subset_config_id] = True - # break - # else: - # if status != StatusType.SUCCESS: - # self.logger.info("A CONFIG did not finish " - # " for subset ratio %f.", - # ratio) - # continue - # else: - # self.logger.info("Finished SUBSET training successfully" - # " with ratio %f", ratio) - # subset_config_succesful[subset_config_id] = True - # break - # - # # Use the first non-failing configuration from the subsets as the new - # # default configuration -> this guards us against the random forest - # # failing on large, sparse datasets - # default_cfg = None - # for subset_config_id, next_config in enumerate(subset_configs): - # if subset_config_succesful[subset_config_id]: - # default_cfg = next_config - # break - # if default_cfg is None: - # default_cfg = self.config_space.get_default_configuration() - # Initialize some SMAC dependencies - run_history = RunHistory(aggregate_func=average_cost) - meta_runhistory = RunHistory(aggregate_func=average_cost) - meta_runs_dataset_indices = {} + runhistory = RunHistory(aggregate_func=average_cost) + # meta_runhistory = RunHistory(aggregate_func=average_cost) + # meta_runs_dataset_indices = {} # == METALEARNING suggestions # we start by evaluating the defaults on the full dataset again @@ -699,17 +435,17 @@ def run_smbo(self, max_iters=1000): meta_features_list = np.array(meta_features_list).reshape((1, -1)) 
self.logger.info(list(meta_features_dict.keys())) - meta_runs = meta_base.get_all_runs(METRIC_TO_STRING[self.metric]) - meta_runs_index = 0 - try: - meta_durations = meta_base.get_all_runs('runtime') - read_runtime_data = True - except KeyError: - read_runtime_data = False - self.logger.critical('Cannot read runtime data.') - if self.acquisition_function == 'EIPS': - self.logger.critical('Reverting to acquisition function EI!') - self.acquisition_function = 'EI' + #meta_runs = meta_base.get_all_runs(METRIC_TO_STRING[self.metric]) + #meta_runs_index = 0 + #try: + # meta_durations = meta_base.get_all_runs('runtime') + # read_runtime_data = True + #except KeyError: + # read_runtime_data = False + # self.logger.critical('Cannot read runtime data.') + # if self.acquisition_function == 'EIPS': + # self.logger.critical('Reverting to acquisition function EI!') + # self.acquisition_function = 'EI' # for meta_dataset in meta_runs.index: # meta_dataset_start_index = meta_runs_index @@ -747,7 +483,7 @@ def run_smbo(self, max_iters=1000): 'cutoff-time': self.func_eval_time_limit, 'memory-limit': self.memory_limit, 'wallclock-limit': self.total_walltime_limit, - 'instances': [[name] for name in meta_features_dict], + #'instances': [[name] for name in meta_features_dict], 'output-dir': self.backend.temporary_directory, 'shared-model': self.shared_mode, 'run-obj': 'quality'}) @@ -764,20 +500,18 @@ def run_smbo(self, max_iters=1000): **self.resampling_strategy_args) types = get_types(self.config_space, self.scenario.feature_array) + + # TODO extract generation of SMAC object into it's own function for + # testing if self.acquisition_function == 'EI': - rh2EPM = RunHistory2EPM4Cost(num_params=num_params, - scenario=self.scenario, - success_states=None, - impute_censored_data=False, - impute_state=None) model = RandomForestWithInstances(types, - instance_features=meta_features_list, + #instance_features=meta_features_list, seed=1, num_trees=10) smac = SMAC(scenario=self.scenario, model=model, rng=seed, tae_runner=ta, - runhistory=run_history) + runhistory=runhistory) elif self.acquisition_function == 'EIPS': rh2EPM = RunHistory2EPM4EIPS(num_params=num_params, scenario=self.scenario, @@ -791,7 +525,7 @@ def run_smbo(self, max_iters=1000): smac = SMAC(scenario=self.scenario, tae_runner=ta, acquisition_function=acquisition_function, model=model, runhistory2epm=rh2EPM, rng=seed, - runhistory=run_history) + runhistory=runhistory) else: raise ValueError('Unknown acquisition function value %s!' % self.acquisition_function) @@ -827,12 +561,6 @@ def run_smbo(self, max_iters=1000): smac.solver.stats.start_timing() # == first, evaluate all metelearning and default configurations smac.solver.incumbent = smac.solver.initial_design.run() - runkey = list(run_history.data.keys())[-1] - runvalue = run_history.data[runkey] - # self.logger.info("Finished evaluating %d. configuration. " - # "Duration %f; loss %f; status %s; additional run " - # "info: %s ", num_run, runvalue.time, runvalue.cost, - # str(runvalue.status), runvalue.additional_info) for challenger in metalearning_configurations: @@ -844,130 +572,33 @@ def run_smbo(self, max_iters=1000): time_bound=self.total_walltime_limit) if smac.solver.scenario.shared_model: - pSMAC.write(run_history=run_history, + pSMAC.write(run_history=smac.solver.runhistory, output_directory=smac.solver.scenario.output_dir, num_run=self.seed) - runkey = list(run_history.data.keys())[-1] - runvalue = run_history.data[runkey] - # self.logger.info("Finished evaluating %d. configuration. 
" - # "Duration %f; loss %f; status %s; additional run " - # "info: %s ", num_run, runvalue.time, runvalue.cost, - # str(runvalue.status), runvalue.additional_info) - if smac.solver.stats.is_budget_exhausted(): break - # TODO print SMAC stats - - # finished = False - # for i, next_config in enumerate(([default_cfg] + - # metalearning_configurations)): - # # Do not evaluate default configurations more than once - # if i >= len([default_cfg]) and next_config in [default_cfg]: - # continue - # - # config_name = 'meta-learning' if i >= len([default_cfg]) \ - # else 'default' - # - # self.logger.info("Starting to evaluate %d. configuration " - # "(%s configuration) with time limit %ds.", - # num_run, config_name, self.func_eval_time_limit) - # self.logger.info(next_config) - # self.reset_data_manager() - # info = eval_with_limits(datamanager=self.datamanager, - # backend=self.backend, - # config=next_config, - # seed=seed, num_run=num_run, - # resampling_strategy=self.resampling_strategy, - # resampling_strategy_args=self.resampling_strategy_args, - # memory_limit=self.memory_limit, - # func_eval_time_limit=self.func_eval_time_limit, - # logger=self.logger) - # (duration, result, _, additional_run_info, status) = info - # run_history.add(config=next_config, cost=result, - # time=duration, status=status, - # instance_id=instance_id, seed=seed, - # additional_info=additional_run_info) - # run_history.update_cost(next_config, result) - # self.logger.info("Finished evaluating %d. configuration. " - # "Duration %f; loss %f; status %s; additional run " - # "info: %s ", num_run, duration, result, - # str(status), additional_run_info) - # num_run += 1 - # if smac.incumbent is None: - # smac.incumbent = next_config - # elif result < run_history.get_cost(smac.incumbent): - # smac.incumbent = next_config - # - # if self.scenario.shared_model: - # pSMAC.write(run_history=run_history, - # output_directory=self.scenario.output_dir, - # num_run=self.seed) - # - # if self.watcher.wall_elapsed( - # 'SMBO') > self.total_walltime_limit: - # finished = True - # - # if finished: - # break - # == after metalearning run SMAC loop while True: if smac.solver.scenario.shared_model: - pSMAC.read(run_history=run_history, + pSMAC.read(run_history=smac.solver.runhistory, output_directory=self.scenario.output_dir, configuration_space=self.config_space, logger=self.logger) - challengers = [] choose_next_start_time = time.time() try: - X_cfg, Y_cfg = rh2EPM.transform(run_history) - - if not run_history.empty(): - # Update costs by normalization - dataset_minimum = np.min(Y_cfg[:, 0]) - Y_cfg[:, 0] = 1 - ((1. - Y_cfg[:, 0]) / - (1. - dataset_minimum)) - Y_cfg[:, 0][Y_cfg[:, 0] > 2] = 2 - - #if len(X_meta) > 0 and len(X_cfg) > 0: - # pass - # X_cfg = np.concatenate((X_meta, X_cfg)) - # Y_cfg = np.concatenate((Y_meta, Y_cfg)) - #elif len(X_meta) > 0: - # X_cfg = X_meta.copy() - # Y_cfg = Y_meta.copy() - #elif len(X_cfg) > 0: - X_cfg = X_cfg.copy() - Y_cfg = Y_cfg.copy() - #else: - # raise ValueError('No training data for SMAC random forest!') - - self.logger.info('Using %d training points for SMAC.' 
% - X_cfg.shape[0]) - choose_next_start_time = time.time() - next_configs_tmp = smac.solver.choose_next( - X_cfg, Y_cfg, num_interleaved_random=110, - num_configurations_by_local_search=10, - num_configurations_by_random_search_sorted=100) - time_for_choose_next = time.time() - choose_next_start_time - self.logger.info('Used %g seconds to find next ' - 'configurations' % (time_for_choose_next)) - challengers.extend(next_configs_tmp) + challengers = self.choose_next(smac) except Exception as e: self.logger.error(e) self.logger.error("Error in getting next configurations " "with SMAC. Using random configuration!") next_config = self.config_space.sample_configuration() - challengers.append(next_config) + challengers = [next_config] time_for_choose_next = time.time() - choose_next_start_time - - # self.logger.info("Finished evaluating %d. configuration. " - # "Duration: %f; loss: %f; status %s; additional " - # "run info: %s ", num_run, duration, result, - # str(status), additional_run_info) + self.logger.info('Used %g seconds to find next ' + 'configurations' % (time_for_choose_next)) smac.solver.incumbent, inc_perf = smac.solver.intensifier.intensify( challengers=challengers, @@ -977,149 +608,51 @@ def run_smbo(self, max_iters=1000): time_bound=time_for_choose_next) if smac.solver.scenario.shared_model: - pSMAC.write(run_history=run_history, + pSMAC.write(run_history=smac.solver.runhistory, output_directory=smac.solver.scenario.output_dir, num_run=self.seed) - # runkey = list(run_history.data.keys())[-1] - # runvalue = run_history.data[runkey] - # self.logger.info("Finished evaluating %d. configuration. " - # "Duration %f; loss %f; status %s; additional run " - # "info: %s ", num_run, runvalue.time, runvalue.cost, - # str(runvalue.status), runvalue.additional_info) - if smac.solver.stats.is_budget_exhausted(): break - self.runhistory = run_history - - # smac.runhistory = run_history - # smac_iter = 0 - # while not finished: - # if self.scenario.shared_model: - # pSMAC.read(run_history=run_history, - # output_directory=self.scenario.output_dir, - # configuration_space=self.config_space, - # logger=self.logger) - # - # next_configs = [] - # time_for_choose_next = -1 - # try: - # X_cfg, Y_cfg = rh2EPM.transform(run_history) - # - # if not run_history.empty(): - # # Update costs by normalization - # dataset_minimum = np.min(Y_cfg[:, 0]) - # Y_cfg[:, 0] = 1 - ((1. - Y_cfg[:, 0]) / - # (1. - dataset_minimum)) - # Y_cfg[:, 0][Y_cfg[:, 0] > 2] = 2 - # - # if len(X_meta) > 0 and len(X_cfg) > 0: - # pass - # #X_cfg = np.concatenate((X_meta, X_cfg)) - # #Y_cfg = np.concatenate((Y_meta, Y_cfg)) - # elif len(X_meta) > 0: - # X_cfg = X_meta.copy() - # Y_cfg = Y_meta.copy() - # elif len(X_cfg) > 0: - # X_cfg = X_cfg.copy() - # Y_cfg = Y_cfg.copy() - # else: - # raise ValueError('No training data for SMAC random forest!') - # - # self.logger.info('Using %d training points for SMAC.' % - # X_cfg.shape[0]) - # choose_next_start_time = time.time() - # next_configs_tmp = smac.solver.choose_next( - # X_cfg, Y_cfg, num_interleaved_random=110, - # num_configurations_by_local_search=10, - # num_configurations_by_random_search_sorted=100) - # time_for_choose_next = time.time() - choose_next_start_time - # self.logger.info('Used %g seconds to find next ' - # 'configurations' % (time_for_choose_next)) - # next_configs.extend(next_configs_tmp) - # except Exception as e: - # self.logger.error(e) - # self.logger.error("Error in getting next configurations " - # "with SMAC. 
Using random configuration!") - # next_config = self.config_space.sample_configuration() - # next_configs.append(next_config) - # - # models_fitted_this_iteration = 0 - # start_time_this_iteration = time.time() - # for next_config in next_configs: - # #x_runtime = impute_inactive_values(next_config) - # #x_runtime = impute_inactive_values(x_runtime).get_array() - # # predicted_runtime = runtime_rf.predict_marginalized_over_instances( - # # x_runtime.reshape((1, -1))) - # # predicted_runtime = np.exp(predicted_runtime[0][0][0]) - 1 - # - # self.logger.info("Starting to evaluate %d. configuration (from " - # "SMAC) with time limit %ds.", num_run, - # self.func_eval_time_limit) - # self.logger.info(next_config) - # self.reset_data_manager() - # info = eval_with_limits(datamanager=self.datamanager, - # backend=self.backend, - # config=next_config, - # seed=seed, num_run=num_run, - # resampling_strategy=self.resampling_strategy, - # resampling_strategy_args=self.resampling_strategy_args, - # memory_limit=self.memory_limit, - # func_eval_time_limit=self.func_eval_time_limit, - # logger=self.logger) - # (duration, result, _, additional_run_info, status) = info - # run_history.add(config=next_config, cost=result, - # time=duration, status=status, - # instance_id=instance_id, seed=seed, - # additional_info=additional_run_info) - # run_history.update_cost(next_config, result) - # - # #self.logger.info('Predicted runtime %g, true runtime %g', - # # predicted_runtime, duration) - # - # # TODO add unittest to make sure everything works fine and - # # this does not get outdated! - # if smac.incumbent is None: - # smac.incumbent = next_config - # elif result < run_history.get_cost(smac.incumbent): - # smac.incumbent = next_config - # - # self.logger.info("Finished evaluating %d. configuration. " - # "Duration: %f; loss: %f; status %s; additional " - # "run info: %s ", num_run, duration, result, - # str(status), additional_run_info) - # smac_iter += 1 - # num_run += 1 - # - # models_fitted_this_iteration += 1 - # time_used_this_iteration = time.time() - start_time_this_iteration - # - # if max_iters is not None: - # finished = (smac_iter >= max_iters) - # - # if self.watcher.wall_elapsed( - # 'SMBO') > self.total_walltime_limit: - # finished = True - # - # if models_fitted_this_iteration >= 2 and \ - # time_for_choose_next > 0 and \ - # time_used_this_iteration > time_for_choose_next: - # break - # elif time_for_choose_next <= 0 and \ - # models_fitted_this_iteration >= 1: - # break - # elif models_fitted_this_iteration >= 50: - # break - # - # if finished: - # break - # - # if self.scenario.shared_model: - # pSMAC.write(run_history=run_history, - # output_directory=self.scenario.output_dir, - # num_run=self.seed) - # - # self.runhistory = run_history - # - + self.runhistory = smac.solver.runhistory + + def choose_next(self, smac): + challengers = [] + + if len(smac.solver.runhistory.data) == 0: + raise ValueError('Cannot use SMBO algorithm on empty runhistory.') + + X_cfg, Y_cfg = smac.solver.rh2EPM.transform(smac.solver.runhistory) + + if not smac.solver.runhistory.empty(): + # Update costs by normalization + dataset_minimum = np.min(Y_cfg[:, 0]) + Y_cfg[:, 0] = 1 - ((1. - Y_cfg[:, 0]) / + (1. 
- dataset_minimum)) + Y_cfg[:, 0][Y_cfg[:, 0] > 2] = 2 + + # if len(X_meta) > 0 and len(X_cfg) > 0: + # pass + # X_cfg = np.concatenate((X_meta, X_cfg)) + # Y_cfg = np.concatenate((Y_meta, Y_cfg)) + # elif len(X_meta) > 0: + # X_cfg = X_meta.copy() + # Y_cfg = Y_meta.copy() + # elif len(X_cfg) > 0: + X_cfg = X_cfg.copy() + Y_cfg = Y_cfg.copy() + # else: + # raise ValueError('No training data for SMAC random forest!') + + self.logger.info('Using %d training points for SMAC.' % + X_cfg.shape[0]) + next_configs_tmp = smac.solver.choose_next( + X_cfg, Y_cfg, num_interleaved_random=110, + num_configurations_by_local_search=10, + num_configurations_by_random_search_sorted=100) + + challengers.extend(next_configs_tmp) + + return challengers + diff --git a/test/test_automl/test_smbo.py b/test/test_automl/test_smbo.py new file mode 100644 index 0000000000..f1a24627d2 --- /dev/null +++ b/test/test_automl/test_smbo.py @@ -0,0 +1,41 @@ +import unittest + +from autosklearn.smbo import AutoMLSMBO +from smac.facade.smac_facade import SMAC +from smac.scenario.scenario import Scenario +from smac.tae.execute_ta_run import StatusType +from ConfigSpace import ConfigurationSpace, UniformFloatHyperparameter, Configuration + + +class TestSMBO(unittest.TestCase): + + def test_choose_next(self): + configspace = ConfigurationSpace() + configspace.add_hyperparameter(UniformFloatHyperparameter('a', 0, 1)) + configspace.add_hyperparameter(UniformFloatHyperparameter('b', 0, 1)) + + dataset_name = 'foo' + func_eval_time_limit = 15 + total_walltime_limit = 15 + memory_limit = 3000 + + auto = AutoMLSMBO(None, dataset_name, None, func_eval_time_limit, + total_walltime_limit, memory_limit, None) + auto.config_space = configspace + scenario = Scenario({'cs': configspace, + 'cutoff-time': func_eval_time_limit, + 'wallclock-limit': total_walltime_limit, + 'memory-limit': memory_limit, + 'run-obj': 'quality'}) + smac = SMAC(scenario) + + self.assertRaisesRegex(ValueError, 'Cannot use SMBO algorithm on ' + 'empty runhistory', + auto.choose_next, smac) + + runhistory = smac.solver.runhistory + runhistory.add(config=Configuration(configspace, + values={'a': 0.1, 'b': 0.2}), + cost=0.5, time=0.5, status=StatusType.SUCCESS) + + auto.choose_next(smac) \ No newline at end of file From 3e186d57573bfa1b871ea53cbc5c50da2c1fb1d5 Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Fri, 18 Nov 2016 17:13:59 +0100 Subject: [PATCH 28/38] MAINT reorganize unit tests --- example/example_holdout.py | 2 +- test/test_automl/test_automl.py | 176 ++++++++++++++++++++++++++ test/test_automl/test_estimators.py | 47 +++++++ test/test_automl/test_models.py | 50 -------- test/test_automl/test_pickle.py | 58 --------- test/test_automl/test_start_automl.py | 150 ---------------------- 6 files changed, 224 insertions(+), 259 deletions(-) delete mode 100644 test/test_automl/test_models.py delete mode 100644 test/test_automl/test_pickle.py delete mode 100644 test/test_automl/test_start_automl.py diff --git a/example/example_holdout.py b/example/example_holdout.py index 8c057a4f37..cdbd9b450f 100644 --- a/example/example_holdout.py +++ b/example/example_holdout.py @@ -31,7 +31,7 @@ def main(): automl = autosklearn.classification.AutoSklearnClassifier( time_left_for_this_task=120, per_run_time_limit=30, tmp_folder='/tmp/autoslearn_holdout_example_tmp', - output_folder='/tmp/autosklearn_holdout_example_out') + output_folder='/tmp/autosklearn_holdout_example_out', automl.fit(X_train, y_train, dataset_name='digits') # Print the best models together with their 
scores - if all scores are diff --git a/test/test_automl/test_automl.py b/test/test_automl/test_automl.py index a405d67e5d..8d938c7905 100644 --- a/test/test_automl/test_automl.py +++ b/test/test_automl/test_automl.py @@ -2,21 +2,42 @@ import multiprocessing import os import sys +import time import unittest import unittest.mock import numpy as np +import sklearn.datasets +import six from autosklearn.util.backend import Backend, BackendContext from autosklearn.automl import AutoML +import autosklearn.automl +import autosklearn.pipeline.util as putil +from autosklearn.util import setup_logger, get_logger, backend +from autosklearn.constants import * +from autosklearn.smbo import load_data sys.path.append(os.path.dirname(__file__)) from base import Base +class AutoMLStub(AutoML): + def __init__(self): + self.__class__ = AutoML + + class AutoMLTest(Base, unittest.TestCase): _multiprocess_can_split_ = True + def setUp(self): + self.automl = AutoMLStub() + + self.automl._shared_mode = False + self.automl._seed = 42 + self.automl._backend = unittest.mock.Mock(spec=Backend) + self.automl._delete_output_directories = lambda: 0 + def test_refit_shuffle_on_fail(self): output = os.path.join(self.test_dir, '..', '.tmp_refit_shuffle_on_fail') context = BackendContext(output, output, False, False) @@ -38,4 +59,159 @@ def test_refit_shuffle_on_fail(self): self.assertEqual(failing_model.fit.call_count, 3) + def test_only_loads_ensemble_models(self): + identifiers = [(1, 2), (3, 4)] + + models = [42] + self.automl._backend.load_ensemble.return_value.identifiers_ \ + = identifiers + self.automl._backend.load_models_by_identifiers.side_effect \ + = lambda ids: models if ids is identifiers else None + + self.automl._load_models() + + self.assertEqual(models, self.automl.models_) + + def test_loads_all_models_if_no_ensemble(self): + models = [42] + self.automl._backend.load_ensemble.return_value = None + self.automl._backend.load_all_models.return_value = models + + self.automl._load_models() + + self.assertEqual(models, self.automl.models_) + + def test_raises_if_no_models(self): + self.automl._backend.load_ensemble.return_value = None + self.automl._backend.load_all_models.return_value = [] + + self.assertRaises(ValueError, self.automl._load_models) + + def test_fit(self): + output = os.path.join(self.test_dir, '..', '.tmp_test_fit') + self._setUp(output) + + X_train, Y_train, X_test, Y_test = putil.get_dataset('iris') + backend_api = backend.create(output, output) + automl = autosklearn.automl.AutoML(backend_api, 15, 5) + automl.fit(X_train, Y_train) + score = automl.score(X_test, Y_test) + self.assertGreaterEqual(score, 0.8) + self.assertEqual(automl._task, MULTICLASS_CLASSIFICATION) + + del automl + self._tearDown(output) + + def test_binary_score(self): + """ + Test fix for binary classification prediction + taking the index 1 of second dimension in prediction matrix + """ + + output = os.path.join(self.test_dir, '..', '.tmp_test_binary_score') + self._setUp(output) + + data = sklearn.datasets.make_classification( + n_samples=1000, n_features=20, n_redundant=5, n_informative=5, + n_repeated=2, n_clusters_per_class=2, random_state=1) + X_train = data[0][:700] + Y_train = data[1][:700] + X_test = data[0][700:] + Y_test = data[1][700:] + + backend_api = backend.create(output, output) + automl = autosklearn.automl.AutoML(backend_api, 15, 5) + automl.fit(X_train, Y_train, task=BINARY_CLASSIFICATION) + self.assertEqual(automl._task, BINARY_CLASSIFICATION) + + score = automl.score(X_test, Y_test) + 
self.assertGreaterEqual(score, 0.5) + + del automl + self._tearDown(output) + + def test_automl_outputs(self): + output = os.path.join(self.test_dir, '..', + '.tmp_test_automl_outputs') + self._setUp(output) + name = '31_bac' + dataset = os.path.join(self.test_dir, '..', '.data', name) + data_manager_file = os.path.join(output, '.auto-sklearn', + 'datamanager.pkl') + + backend_api = backend.create(output, output) + auto = autosklearn.automl.AutoML( + backend_api, 15, 5, + initial_configurations_via_metalearning=25, + seed=100) + auto.fit_automl_dataset(dataset) + + # pickled data manager (without one hot encoding!) + with open(data_manager_file, 'rb') as fh: + D = six.moves.cPickle.load(fh) + self.assertTrue(np.allclose(D.data['X_train'][0, :3], + [1., 12., 2.])) + + # Check that all directories are there + fixture = ['predictions_valid', 'true_targets_ensemble.npy', + 'start_time_100', 'datamanager.pkl', 'predictions_ensemble', + 'ensembles', 'predictions_test', 'models'] + self.assertEqual(sorted(os.listdir(os.path.join(output, + '.auto-sklearn'))), + sorted(fixture)) + + # At least one ensemble, one validation, one test prediction and one + # model and one ensemble + fixture = os.listdir(os.path.join(output, '.auto-sklearn', + 'predictions_ensemble')) + self.assertIn('predictions_ensemble_100_00001.npy', fixture) + + fixture = os.listdir(os.path.join(output, '.auto-sklearn', + 'models')) + self.assertIn('100.1.model', fixture) + + fixture = os.listdir(os.path.join(output, '.auto-sklearn', + 'ensembles')) + self.assertIn('100.0000000000.ensemble', fixture) + + # Start time + start_time_file_path = os.path.join(output, '.auto-sklearn', + "start_time_100") + with open(start_time_file_path, 'r') as fh: + start_time = float(fh.read()) + self.assertGreaterEqual(time.time() - start_time, 10) + + del auto + self._tearDown(output) + + def test_do_dummy_prediction(self): + for name in ['401_bac', '31_bac', 'adult', 'cadata']: + output = os.path.join(self.test_dir, '..', + '.tmp_test_do_dummy_prediction') + self._setUp(output) + + dataset = os.path.join(self.test_dir, '..', '.data', name) + + backend_api = backend.create(output, output) + auto = autosklearn.automl.AutoML( + backend_api, 15, 5, + initial_configurations_via_metalearning=25) + setup_logger() + auto._logger = get_logger('test_do_dummy_predictions') + auto._backend._make_internals_directory() + D = load_data(dataset, backend_api) + auto._backend.save_datamanager(D) + auto._do_dummy_prediction(D, 1) + + # Ensure that the dummy predictions are not in the current working + # directory, but in the output directory (under output) + self.assertFalse(os.path.exists(os.path.join(os.getcwd(), + '.auto-sklearn'))) + self.assertTrue(os.path.exists(os.path.join( + output, '.auto-sklearn', 'predictions_ensemble', + 'predictions_ensemble_1_00001.npy'))) + + del auto + self._tearDown(output) + diff --git a/test/test_automl/test_estimators.py b/test/test_automl/test_estimators.py index 9cfed64965..c7d17b5f6c 100644 --- a/test/test_automl/test_estimators.py +++ b/test/test_automl/test_estimators.py @@ -2,6 +2,7 @@ from __future__ import print_function import collections import os +import pickle import sys import unittest import unittest.mock @@ -270,3 +271,49 @@ def test_multilabel_prediction(self): actual_result = classifier.predict([None] * len(predicted_indexes)) np.testing.assert_array_equal(expected_result, actual_result) + + def test_can_pickle_classifier(self): + output = os.path.join(self.test_dir, '..', '.tmp_can_pickle') + self._setUp(output) 
+ + X_train, Y_train, X_test, Y_test = putil.get_dataset('iris') + automl = AutoSklearnClassifier(time_left_for_this_task=15, + per_run_time_limit=5, + tmp_folder=output, + output_folder=output) + automl.fit(X_train, Y_train) + + initial_predictions = automl.predict(X_test) + initial_accuracy = sklearn.metrics.accuracy_score(Y_test, + initial_predictions) + self.assertTrue(initial_accuracy > 0.75) + + # Test pickle + dump_file = os.path.join(output, 'automl.dump.pkl') + + with open(dump_file, 'wb') as f: + pickle.dump(automl, f) + + with open(dump_file, 'rb') as f: + restored_automl = pickle.load(f) + + restored_predictions = restored_automl.predict(X_test) + restored_accuracy = sklearn.metrics.accuracy_score(Y_test, + restored_predictions) + self.assertTrue(restored_accuracy > 0.75) + + self.assertEqual(initial_accuracy, restored_accuracy) + + # Test joblib + dump_file = os.path.join(output, 'automl.dump.joblib') + + sklearn.externals.joblib.dump(automl, dump_file) + + restored_automl = sklearn.externals.joblib.load(dump_file) + + restored_predictions = restored_automl.predict(X_test) + restored_accuracy = sklearn.metrics.accuracy_score(Y_test, + restored_predictions) + self.assertTrue(restored_accuracy > 0.75) + + self.assertEqual(initial_accuracy, restored_accuracy) diff --git a/test/test_automl/test_models.py b/test/test_automl/test_models.py deleted file mode 100644 index f91fa3871d..0000000000 --- a/test/test_automl/test_models.py +++ /dev/null @@ -1,50 +0,0 @@ -# -*- encoding: utf-8 -*- -from __future__ import print_function -import unittest -import unittest.mock - -from autosklearn.automl import AutoML -from autosklearn.util.backend import Backend - - -class AutoMLStub(AutoML): - - def __init__(self): - self.__class__ = AutoML - - -class AutoMlModelsTest(unittest.TestCase): - - def setUp(self): - self.automl = AutoMLStub() - self.automl._shared_mode = False - self.automl._seed = 42 - self.automl._backend = unittest.mock.Mock(spec=Backend) - self.automl._delete_output_directories = lambda: 0 - - def test_only_loads_ensemble_models(self): - identifiers = [(1, 2), (3, 4)] - models = [ 42 ] - self.automl._backend.load_ensemble.return_value.identifiers_ \ - = identifiers - self.automl._backend.load_models_by_identifiers.side_effect \ - = lambda ids: models if ids is identifiers else None - - self.automl._load_models() - - self.assertEqual(models, self.automl.models_) - - def test_loads_all_models_if_no_ensemble(self): - models = [ 42 ] - self.automl._backend.load_ensemble.return_value = None - self.automl._backend.load_all_models.return_value = models - - self.automl._load_models() - - self.assertEqual(models, self.automl.models_) - - def test_raises_if_no_models(self): - self.automl._backend.load_ensemble.return_value = None - self.automl._backend.load_all_models.return_value = [] - - self.assertRaises(ValueError, self.automl._load_models) diff --git a/test/test_automl/test_pickle.py b/test/test_automl/test_pickle.py deleted file mode 100644 index 6db44f5a15..0000000000 --- a/test/test_automl/test_pickle.py +++ /dev/null @@ -1,58 +0,0 @@ -from __future__ import print_function -from autosklearn.classification import AutoSklearnClassifier -import autosklearn.pipeline.util as putil -import unittest -import six.moves.cPickle as pickle -import os -import sklearn.datasets -import sklearn.metrics -import sklearn.externals.joblib -from base import Base - - -class PicklingTests(Base, unittest.TestCase): - - def test_can_pickle_classifier(self): - - output = os.path.join(self.test_dir, '..', 
'.tmp_can_pickle') - self._setUp(output) - - X_train, Y_train, X_test, Y_test = putil.get_dataset('iris') - automl = AutoSklearnClassifier(time_left_for_this_task=15, - per_run_time_limit=5, - tmp_folder=output, - output_folder=output) - automl.fit(X_train, Y_train) - - initial_predictions = automl.predict(X_test) - initial_accuracy = sklearn.metrics.accuracy_score(Y_test, initial_predictions) - self.assertTrue(initial_accuracy > 0.75) - - # Test pickle - dump_file = os.path.join(output, 'automl.dump.pkl') - - with open(dump_file, 'wb') as f: - pickle.dump(automl, f) - - with open(dump_file, 'rb') as f: - restored_automl = pickle.load(f) - - restored_predictions = restored_automl.predict(X_test) - restored_accuracy = sklearn.metrics.accuracy_score(Y_test, restored_predictions) - self.assertTrue(restored_accuracy > 0.75) - - self.assertEqual(initial_accuracy, restored_accuracy) - - # Test joblib - dump_file = os.path.join(output, 'automl.dump.joblib') - - sklearn.externals.joblib.dump(automl, dump_file) - - restored_automl = sklearn.externals.joblib.load(dump_file) - - restored_predictions = restored_automl.predict(X_test) - restored_accuracy = sklearn.metrics.accuracy_score(Y_test, restored_predictions) - self.assertTrue(restored_accuracy > 0.75) - - self.assertEqual(initial_accuracy, restored_accuracy) - diff --git a/test/test_automl/test_start_automl.py b/test/test_automl/test_start_automl.py deleted file mode 100644 index 722a5f710c..0000000000 --- a/test/test_automl/test_start_automl.py +++ /dev/null @@ -1,150 +0,0 @@ -# -*- encoding: utf-8 -*- -import multiprocessing -import os -import sys -import time -import unittest - -import numpy as np -import six -import sklearn.datasets - -import autosklearn.automl -import autosklearn.pipeline.util as putil -from autosklearn.util import setup_logger, get_logger, backend -from autosklearn.constants import * -from autosklearn.smbo import load_data - -sys.path.append(os.path.dirname(__file__)) -from base import Base - -class AutoMLTest(Base, unittest.TestCase): - _multiprocess_can_split_ = True - - def test_fit(self): - - output = os.path.join(self.test_dir, '..', '.tmp_test_fit') - self._setUp(output) - - X_train, Y_train, X_test, Y_test = putil.get_dataset('iris') - backend_api = backend.create(output, output) - automl = autosklearn.automl.AutoML(backend_api, 15, 5) - automl.fit(X_train, Y_train) - score = automl.score(X_test, Y_test) - self.assertGreaterEqual(score, 0.8) - self.assertEqual(automl._task, MULTICLASS_CLASSIFICATION) - - del automl - self._tearDown(output) - - def test_binary_score(self): - """ - Test fix for binary classification prediction - taking the index 1 of second dimension in prediction matrix - """ - - output = os.path.join(self.test_dir, '..', '.tmp_test_binary_score') - self._setUp(output) - - data = sklearn.datasets.make_classification( - n_samples=1000, n_features=20, n_redundant=5, n_informative=5, - n_repeated=2, n_clusters_per_class=2, random_state=1) - X_train = data[0][:700] - Y_train = data[1][:700] - X_test = data[0][700:] - Y_test = data[1][700:] - - backend_api = backend.create(output, output) - automl = autosklearn.automl.AutoML(backend_api, 15, 5) - automl.fit(X_train, Y_train, task=BINARY_CLASSIFICATION) - self.assertEqual(automl._task, BINARY_CLASSIFICATION) - - score = automl.score(X_test, Y_test) - self.assertGreaterEqual(score, 0.5) - - del automl - self._tearDown(output) - - def test_automl_outputs(self): - output = os.path.join(self.test_dir, '..', - '.tmp_test_automl_outputs') - self._setUp(output) 
- name = '31_bac' - dataset = os.path.join(self.test_dir, '..', '.data', name) - data_manager_file = os.path.join(output, '.auto-sklearn', - 'datamanager.pkl') - - backend_api = backend.create(output, output) - auto = autosklearn.automl.AutoML( - backend_api, 15, 5, - initial_configurations_via_metalearning=25, - seed=100) - auto.fit_automl_dataset(dataset) - - # pickled data manager (without one hot encoding!) - with open(data_manager_file, 'rb') as fh: - D = six.moves.cPickle.load(fh) - self.assertTrue(np.allclose(D.data['X_train'][0, :3], - [1., 12., 2.])) - - # Check that all directories are there - fixture = ['predictions_valid', 'true_targets_ensemble.npy', - 'start_time_100', 'datamanager.pkl', 'predictions_ensemble', - 'ensembles', 'predictions_test', 'models'] - self.assertEqual(sorted(os.listdir(os.path.join(output, - '.auto-sklearn'))), - sorted(fixture)) - - # At least one ensemble, one validation, one test prediction and one - # model and one ensemble - fixture = os.listdir(os.path.join(output, '.auto-sklearn', - 'predictions_ensemble')) - self.assertIn('predictions_ensemble_100_00001.npy', fixture) - - fixture = os.listdir(os.path.join(output, '.auto-sklearn', - 'models')) - self.assertIn('100.1.model', fixture) - - fixture = os.listdir(os.path.join(output, '.auto-sklearn', - 'ensembles')) - self.assertIn('100.0000000000.ensemble', fixture) - - # Start time - start_time_file_path = os.path.join(output, '.auto-sklearn', - "start_time_100") - with open(start_time_file_path, 'r') as fh: - start_time = float(fh.read()) - self.assertGreaterEqual(time.time() - start_time, 10) - - del auto - self._tearDown(output) - - def test_do_dummy_prediction(self): - for name in ['401_bac', '31_bac', 'adult', 'cadata']: - output = os.path.join(self.test_dir, '..', - '.tmp_test_do_dummy_prediction') - self._setUp(output) - - dataset = os.path.join(self.test_dir, '..', '.data', name) - - backend_api = backend.create(output, output) - auto = autosklearn.automl.AutoML( - backend_api, 15, 5, - initial_configurations_via_metalearning=25) - setup_logger() - auto._logger = get_logger('test_do_dummy_predictions') - auto._backend._make_internals_directory() - D = load_data(dataset, backend_api) - auto._backend.save_datamanager(D) - auto._do_dummy_prediction(D, 1) - - # Ensure that the dummy predictions are not in the current working - # directory, but in the output directory (under output) - self.assertFalse(os.path.exists(os.path.join(os.getcwd(), - '.auto-sklearn'))) - self.assertTrue(os.path.exists(os.path.join( - output, '.auto-sklearn', 'predictions_ensemble', - 'predictions_ensemble_1_00001.npy'))) - - del auto - self._tearDown(output) From 4d44ae6dcc9ec09c0f8bedf35707a8c7e522a973 Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Fri, 18 Nov 2016 17:56:44 +0100 Subject: [PATCH 29/38] FIX unittests --- test/test_automl/test_automl.py | 2 ++ test/test_automl/test_estimators.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/test/test_automl/test_automl.py b/test/test_automl/test_automl.py index 8d938c7905..eb55af4185 100644 --- a/test/test_automl/test_automl.py +++ b/test/test_automl/test_automl.py @@ -31,6 +31,8 @@ class AutoMLTest(Base, unittest.TestCase): _multiprocess_can_split_ = True def setUp(self): + super().setUp() + self.automl = AutoMLStub() self.automl._shared_mode = False diff --git a/test/test_automl/test_estimators.py b/test/test_automl/test_estimators.py index c7d17b5f6c..df2cf1a721 100644 --- a/test/test_automl/test_estimators.py +++ 
b/test/test_automl/test_estimators.py @@ -229,7 +229,7 @@ def test_cv_results(self): self._tearDown(output) -class AutoMLClassifierTest(unittest.TestCase): +class AutoMLClassifierTest(Base, unittest.TestCase): def test_multiclass_prediction(self): classes = [['a', 'b', 'c']] From 752a55429300951352483cf820d68881ae236491 Mon Sep 17 00:00:00 2001 From: Jaidev Deshpande Date: Tue, 22 Nov 2016 13:39:06 +0530 Subject: [PATCH 30/38] Replace spaces with tabs in Makefile --- Makefile | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index 3b82ed69d7..a1d926990f 100644 --- a/Makefile +++ b/Makefile @@ -16,12 +16,12 @@ inplace: $(PYTHON) setup.py build_ext -i doc: - cd ./doc - make html - cd .. + cd ./doc + make html + cd .. test-code: in - $(NOSETESTS) -s -v tests + $(NOSETESTS) -s -v tests test-doc: $(NOSETESTS) -s -v doc/*.rst From 8c79f0189de29479db8abc3b3415a7842102ac37 Mon Sep 17 00:00:00 2001 From: Jaidev Deshpande Date: Tue, 22 Nov 2016 13:52:22 +0530 Subject: [PATCH 31/38] The test directory is called "test", not tests --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index a1d926990f..755e61f3f8 100644 --- a/Makefile +++ b/Makefile @@ -21,12 +21,12 @@ doc: cd .. test-code: in - $(NOSETESTS) -s -v tests + $(NOSETESTS) -s -v test test-doc: $(NOSETESTS) -s -v doc/*.rst test-coverage: rm -rf coverage .coverage - $(NOSETESTS) -s -v --with-coverage tests + $(NOSETESTS) -s -v --with-coverage test test: test-code test-sphinxext test-doc From b30b5b75216435acae3cfa5cc574f9e75280633d Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Wed, 23 Nov 2016 09:28:59 +0100 Subject: [PATCH 32/38] CI fix conda installation on OSX --- .travis.yml | 8 ++++---- ci_scripts/install.sh | 6 ++++-- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/.travis.yml b/.travis.yml index 9db11785ef..935ecaa9bc 100644 --- a/.travis.yml +++ b/.travis.yml @@ -8,9 +8,9 @@ matrix: include: - os: linux - env: DISTRIB="conda" PYTHON_VERSION="3.4" + env: DISTRIB="conda" PYTHON_VERSION="3.4" MINICONDA_URL="https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh" - os: linux - env: DISTRIB="conda" PYTHON_VERSION="3.5" COVERAGE="true" + env: DISTRIB="conda" PYTHON_VERSION="3.5" COVERAGE="true" MINICONDA_URL="https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh" # Set language to generic to not break travis-ci # https://github.com/travis-ci/travis-ci/issues/2312#issuecomment-195620855 @@ -19,11 +19,11 @@ matrix: - os: osx sudo: required language: generic - env: DISTRIB="conda" PYTHON_VERSION="3.4" + env: DISTRIB="conda" PYTHON_VERSION="3.4" MINICONDA_URL="https://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh" - os: osx sudo: required language: generic - env: DISTRIB="conda" PYTHON_VERSION="3.5" + env: DISTRIB="conda" PYTHON_VERSION="3.5" MINICONDA_URL="https://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh" cache: # We use three different cache directory diff --git a/ci_scripts/install.sh b/ci_scripts/install.sh index 8a76a0914c..14252b8683 100644 --- a/ci_scripts/install.sh +++ b/ci_scripts/install.sh @@ -13,12 +13,12 @@ ls -l echo if [[ ! -f miniconda.sh ]] then - wget http://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh \ - -O miniconda.sh + wget $MINICONDA_URL -O miniconda.sh fi chmod +x miniconda.sh && ./miniconda.sh -b -p $HOME/miniconda cd .. 
export PATH=/home/travis/miniconda/bin:$PATH +if [[ `which conda` ]]; then echo 'Conda installation successful'; else exit 1; fi conda update --yes conda popd @@ -30,6 +30,8 @@ source activate testenv # Install anaconda gcc compiler to have compiler compatible with the # anaconda python executable conda install gcc --yes +echo "Using GCC at "`which gcc` +export CC=`which gcc` # Install requirements in correct order cat requirements.txt | xargs -n 1 -L 1 pip install From e74f84e41b422e6990dca1bdc2510978e553536f Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Wed, 23 Nov 2016 09:55:11 +0100 Subject: [PATCH 33/38] CI move conda setup to travis.yml --- .travis.yml | 16 ++++++++++++++- ci_scripts/install.sh | 46 ------------------------------------------- 2 files changed, 15 insertions(+), 47 deletions(-) delete mode 100644 ci_scripts/install.sh diff --git a/.travis.yml b/.travis.yml index 935ecaa9bc..10be506a3c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -42,8 +42,22 @@ env: - TEST_DIR=/tmp/test_dir/ - MODULE=autosklearn +before_install: + - wget $MINICONDA_URL -O miniconda.sh + - bash miniconda.sh -b -p $HOME/miniconda + - export PATH="$HOME/miniconda/bin:$PATH" + - if [[ `which conda` ]]; then echo 'Conda installation successful'; else exit 1; fi + - conda update --yes conda + - conda create -n testenv --yes python=$PYTHON_VERSION pip wheel nose + - source activate testenv + - conda install --yes gcc + - echo "Using GCC at "`which gcc` + - export CC=`which gcc` + install: - - source ci_scripts/install.sh + - pip install coverage pep8 python-coveralls + - cat requirements.txt | xargs -n 1 -L 1 pip install + - python setup.py install script: bash ci_scripts/test.sh after_success: source ci_scripts/success.sh diff --git a/ci_scripts/install.sh b/ci_scripts/install.sh deleted file mode 100644 index 14252b8683..0000000000 --- a/ci_scripts/install.sh +++ /dev/null @@ -1,46 +0,0 @@ -# Deactivate the travis-provided virtual environment and setup a -# conda-based environment instead -deactivate - -# Use the miniconda installer for faster download / install of conda -# itself -pushd . -cd -mkdir -p download -cd download -echo "Cached in $HOME/download :" -ls -l -echo -if [[ ! -f miniconda.sh ]] - then - wget $MINICONDA_URL -O miniconda.sh - fi -chmod +x miniconda.sh && ./miniconda.sh -b -p $HOME/miniconda -cd .. 
-export PATH=/home/travis/miniconda/bin:$PATH -if [[ `which conda` ]]; then echo 'Conda installation successful'; else exit 1; fi -conda update --yes conda -popd - -# Configure the conda environment and put it in the path using the -# provided versions -conda create -n testenv --yes python=$PYTHON_VERSION pip nose -source activate testenv - -# Install anaconda gcc compiler to have compiler compatible with the -# anaconda python executable -conda install gcc --yes -echo "Using GCC at "`which gcc` -export CC=`which gcc` - -# Install requirements in correct order -cat requirements.txt | xargs -n 1 -L 1 pip install - -if [[ "$COVERAGE" == "true" ]]; then - pip install coverage coveralls -fi - -python --version -python -c "import numpy; print('numpy %s' % numpy.__version__)" -python -c "import scipy; print('scipy %s' % scipy.__version__)" -python setup.py develop From bb12e84c10d90842b7303c8ee54149e985faeed6 Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Wed, 23 Nov 2016 10:35:04 +0100 Subject: [PATCH 34/38] FIX pickling issue --- autosklearn/automl.py | 52 ++++++++++++++--------------- autosklearn/smbo.py | 1 + test/test_automl/test_estimators.py | 4 +-- 3 files changed, 29 insertions(+), 28 deletions(-) diff --git a/autosklearn/automl.py b/autosklearn/automl.py index 3b9a0f768a..8eb484a8de 100644 --- a/autosklearn/automl.py +++ b/autosklearn/automl.py @@ -369,7 +369,7 @@ def _fit(self, datamanager): if time_left_for_smac <= 0: self._logger.warning("Not starting SMAC because there is no time " "left.") - self._proc_smac = None + _proc_smac = None else: if self._per_run_time_limit is None or \ self._per_run_time_limit > time_left_for_smac: @@ -380,25 +380,25 @@ def _fit(self, datamanager): else: per_run_time_limit = self._per_run_time_limit - self._proc_smac = AutoMLSMBO(config_space=self.configuration_space, - dataset_name=self._dataset_name, - backend=self._backend, - total_walltime_limit=time_left_for_smac, - func_eval_time_limit=per_run_time_limit, - memory_limit=self._ml_memory_limit, - data_memory_limit=self._data_memory_limit, - watcher=self._stopwatch, - start_num_run=num_run, - num_metalearning_cfgs=self._initial_configurations_via_metalearning, - config_file=configspace_path, - smac_iters=self._max_iter_smac, - seed=self._seed, - metadata_directory=self._metadata_directory, - resampling_strategy=self._resampling_strategy, - resampling_strategy_args=self._resampling_strategy_arguments, - acquisition_function=self.acquisition_function, - shared_mode=self._shared_mode) - self._proc_smac.run_smbo() + _proc_smac = AutoMLSMBO(config_space=self.configuration_space, + dataset_name=self._dataset_name, + backend=self._backend, + total_walltime_limit=time_left_for_smac, + func_eval_time_limit=per_run_time_limit, + memory_limit=self._ml_memory_limit, + data_memory_limit=self._data_memory_limit, + watcher=self._stopwatch, + start_num_run=num_run, + num_metalearning_cfgs=self._initial_configurations_via_metalearning, + config_file=configspace_path, + smac_iters=self._max_iter_smac, + seed=self._seed, + metadata_directory=self._metadata_directory, + resampling_strategy=self._resampling_strategy, + resampling_strategy_args=self._resampling_strategy_arguments, + acquisition_function=self.acquisition_function, + shared_mode=self._shared_mode) + self.runhistory_ = _proc_smac.run_smbo() self._proc_ensemble = None self._load_models() @@ -569,8 +569,8 @@ def grid_scores_(self): scores_per_config = defaultdict(list) config_list = list() - for run_key in self._proc_smac.runhistory.data: - run_value = 
self._proc_smac.runhistory.data[run_key] + for run_key in self.runhistory_.data: + run_value = self.runhistory_.data[run_key] config_id = run_key.config_id cost = run_value.cost @@ -583,7 +583,7 @@ def grid_scores_(self): for config_id in config_list: scores = [1 - score for score in scores_per_config[config_id]] mean_score = np.mean(scores) - config = self._proc_smac.runhistory.ids_config[config_id] + config = self.runhistory_.ids_config[config_id] grid_score = _CVScoreTuple(config.get_dictionary(), mean_score, scores) @@ -624,10 +624,10 @@ def cv_results_(self): mean_fit_time = [] params = [] status = [] - for run_key in self._proc_smac.runhistory.data: - run_value = self._proc_smac.runhistory.data[run_key] + for run_key in self.runhistory_.data: + run_value = self.runhistory_.data[run_key] config_id = run_key.config_id - config = self._proc_smac.runhistory.ids_config[config_id] + config = self.runhistory_.ids_config[config_id] param_dict = config.get_dictionary() params.append(param_dict) diff --git a/autosklearn/smbo.py b/autosklearn/smbo.py index ef1f41c7a1..a2b57ce4ce 100644 --- a/autosklearn/smbo.py +++ b/autosklearn/smbo.py @@ -616,6 +616,7 @@ def run_smbo(self, max_iters=1000): break self.runhistory = smac.solver.runhistory + return runhistory def choose_next(self, smac): challengers = [] diff --git a/test/test_automl/test_estimators.py b/test/test_automl/test_estimators.py index df2cf1a721..042503c704 100644 --- a/test/test_automl/test_estimators.py +++ b/test/test_automl/test_estimators.py @@ -179,7 +179,7 @@ def test_grid_scores(self): ensemble_size=0) cls_ = cls.build_automl() automl = cls_._automl - automl._proc_smac = unittest.mock.MagicMock() + automl.runhistory_ = unittest.mock.MagicMock() RunKey = collections.namedtuple( 'RunKey', ['config_id', 'instance_id', 'seed']) @@ -189,7 +189,7 @@ def test_grid_scores(self): runhistory = dict() runhistory[RunKey(1, 1, 1)] = RunValue(1, 1, 1, '') - automl._proc_smac.runhistory.data = runhistory + automl.runhistory_.data = runhistory grid_scores_ = automl.grid_scores_ self.assertIsInstance(grid_scores_[0], _CVScoreTuple) From 4e47005732a704270c7d2d3be83fbc22ae13d1ab Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Fri, 25 Nov 2016 12:09:41 +0100 Subject: [PATCH 35/38] MAINT bump SMAC version to 0.2.1 --- autosklearn/__init__.py | 2 +- requirements.txt | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/autosklearn/__init__.py b/autosklearn/__init__.py index ac55aa5e46..d5abaedf18 100644 --- a/autosklearn/__init__.py +++ b/autosklearn/__init__.py @@ -5,7 +5,7 @@ __MANDATORY_PACKAGES__ = ''' scikit-learn==0.17.1 -smac==0.2.0 +smac==0.2.1 lockfile>=0.10 ConfigSpace>=0.2.1 pyrfr==0.2.0 diff --git a/requirements.txt b/requirements.txt index c5390b1efb..b4fac235b9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -22,4 +22,4 @@ xgboost==0.4a30 ConfigSpace pynisher>=0.4 pyrfr -smac==0.2.0 +smac==0.2.1 diff --git a/setup.py b/setup.py index cf50e71ad9..a0d17359e0 100644 --- a/setup.py +++ b/setup.py @@ -33,7 +33,7 @@ "ConfigSpace", "pynisher>=0.4", "pyrfr", - "smac==0.2.0" + "smac==0.2.1" ] From 3754aaab3fa1e2270ed290179059cb3dba476a52 Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Fri, 25 Nov 2016 12:10:05 +0100 Subject: [PATCH 36/38] FIX target algorithm runs are deterministic --- autosklearn/smbo.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/autosklearn/smbo.py b/autosklearn/smbo.py index a2b57ce4ce..10df064861 100644 --- a/autosklearn/smbo.py +++ 
b/autosklearn/smbo.py @@ -486,7 +486,8 @@ def run_smbo(self, max_iters=1000): #'instances': [[name] for name in meta_features_dict], 'output-dir': self.backend.temporary_directory, 'shared-model': self.shared_mode, - 'run-obj': 'quality'}) + 'run-obj': 'quality', + 'deterministic': 'true'}) # TODO rebuild target algorithm to be it's own target algorithm # evaluator, which takes into account that a run can be killed prior From 235062d8586c3a246499d2adf9f22cae08edea8c Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Fri, 25 Nov 2016 12:10:22 +0100 Subject: [PATCH 37/38] FIX syntax error in example --- example/example_holdout.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/example/example_holdout.py b/example/example_holdout.py index cdbd9b450f..8c057a4f37 100644 --- a/example/example_holdout.py +++ b/example/example_holdout.py @@ -31,7 +31,7 @@ def main(): automl = autosklearn.classification.AutoSklearnClassifier( time_left_for_this_task=120, per_run_time_limit=30, tmp_folder='/tmp/autoslearn_holdout_example_tmp', - output_folder='/tmp/autosklearn_holdout_example_out', + output_folder='/tmp/autosklearn_holdout_example_out') automl.fit(X_train, y_train, dataset_name='digits') # Print the best models together with their scores - if all scores are From bf9593dde746c5b73ea5207967d069d174baf306 Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Mon, 28 Nov 2016 11:00:27 +0100 Subject: [PATCH 38/38] MAINT prepare release --- autosklearn/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autosklearn/__init__.py b/autosklearn/__init__.py index d5abaedf18..b4abe6ef18 100644 --- a/autosklearn/__init__.py +++ b/autosklearn/__init__.py @@ -1,7 +1,7 @@ # -*- encoding: utf-8 -*- from autosklearn.util import dependencies -__version__ = '0.1.0' +__version__ = '0.1.1' __MANDATORY_PACKAGES__ = ''' scikit-learn==0.17.1
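
A worked example for the cost normalization introduced in the new AutoMLSMBO.choose_next method above. The snippet below is a self-contained sketch using plain NumPy with a made-up loss array (the names Y_cfg and dataset_minimum mirror the patch; the concrete numbers are illustrative only): the best observed loss is mapped to 0, a loss of 1.0 is mapped to 1, and anything worse is capped at 2 before the matrix is handed to smac.solver.choose_next.

    # Sketch only: repeats the normalization lines from choose_next on toy data.
    import numpy as np

    Y_cfg = np.array([[0.2], [0.6], [1.0], [2.5]])   # column 0: observed losses (made up)

    dataset_minimum = np.min(Y_cfg[:, 0])            # best loss seen so far -> 0.2
    Y_cfg[:, 0] = 1 - ((1. - Y_cfg[:, 0]) /
                       (1. - dataset_minimum))       # best run -> 0, loss of 1.0 -> 1
    Y_cfg[:, 0][Y_cfg[:, 0] > 2] = 2                 # cap very bad / crashed runs at 2

    print(Y_cfg[:, 0])                               # -> 0.0, 0.5, 1.0, 2.0

The effect is that the incumbent is always presented to the surrogate model at cost 0, independent of the dataset's absolute error level, while hopeless runs cannot dominate the model fit.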
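
The "FIX pickling issue" change above stops storing the AutoMLSMBO helper on the AutoML object (self._proc_smac becomes a local _proc_smac) and keeps only the resulting run history as runhistory_, which suggests the helper itself was not safe to pickle. A minimal sketch of that pattern with hypothetical stand-in classes (Helper, KeepsHelper and KeepsResultOnly are invented for illustration and are not real auto-sklearn classes):

    # Sketch only: why keeping an unpicklable helper on an estimator breaks
    # pickle.dumps, while keeping just the helper's result does not.
    import pickle


    class Helper:
        """Stand-in for an object that cannot be pickled (e.g. holds a logger/lock)."""
        def __getstate__(self):
            raise TypeError('Helper is not picklable')

        def run(self):
            return {'config_1': 0.5}          # the picklable result we care about


    class KeepsHelper:
        def fit(self):
            self.helper_ = Helper()           # whole helper stored -> pickling fails
            self.result_ = self.helper_.run()


    class KeepsResultOnly:
        def fit(self):
            helper = Helper()                 # helper stays a local variable
            self.result_ = helper.run()       # only the result is kept


    good = KeepsResultOnly()
    good.fit()
    pickle.dumps(good)                        # works

    bad = KeepsHelper()
    bad.fit()
    try:
        pickle.dumps(bad)
    except TypeError as exc:
        print('pickling failed:', exc)

Keeping only the picklable result on the estimator is what test_can_pickle_classifier above exercises when it round-trips a fitted AutoSklearnClassifier through pickle and joblib.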