diff --git a/azure-pipelines-steps.yml b/azure-pipelines-steps.yml index 7559b8d44..448484735 100644 --- a/azure-pipelines-steps.yml +++ b/azure-pipelines-steps.yml @@ -5,6 +5,7 @@ parameters: body: [] + package: '.' steps: - task: UsePythonVersion@0 @@ -23,7 +24,7 @@ steps: condition: and(succeeded(), eq(variables['Agent.OS'], 'Linux')) # Install the package -- script: 'python -m pip install --upgrade pip && pip install --upgrade setuptools && pip install .' +- script: 'python -m pip install --upgrade pip && pip install --upgrade setuptools && pip install ${{ parameters.package }}' displayName: 'Install dependencies' - ${{ parameters.body }} diff --git a/azure-pipelines.yml b/azure-pipelines.yml index d2c10c12d..501ccd89b 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -75,6 +75,7 @@ jobs: - script: 'python setup.py build_sphinx -b doctest' displayName: 'Run doctests' + package: '.[automl]' - job: 'Notebooks' dependsOn: 'EvalChanges' @@ -102,6 +103,41 @@ jobs: testRunTitle: 'Notebooks' condition: succeededOrFailed() +- job: 'AutoML' + dependsOn: 'EvalChanges' + condition: eq(dependencies.EvalChanges.outputs['output.testCode'], 'True') + variables: + python.version: '3.6' + pool: + vmImage: 'ubuntu-16.04' + steps: + - template: azure-pipelines-steps.yml + parameters: + body: + - task: AzureCLI@2 + displayName: 'AutoML tests' + inputs: + azureSubscription: 'automl' + scriptLocation: 'inlineScript' + scriptType: 'pscore' + powerShellIgnoreLASTEXITCODE: '' # string for now due to https://github.com/microsoft/azure-pipelines-tasks/issues/12266 + inlineScript: | + $env:SUBSCRIPTION_ID = az account show --query id -o tsv + python setup.py pytest + env: + WORKSPACE_NAME: 'testWorkspace' + RESOURCE_GROUP: 'testingAutoMLEconML' + PYTEST_ADDOPTS: '-m "automl" -n 0' + COVERAGE_PROCESS_START: 'setup.cfg' + + - task: PublishTestResults@2 + displayName: 'Publish Test Results **/test-results.xml' + inputs: + testResultsFiles: '**/test-results.xml' + testRunTitle: 
'AutoML' + condition: succeededOrFailed() + package: '.[automl]' + - job: 'Linting' dependsOn: 'EvalChanges' condition: eq(dependencies.EvalChanges.outputs['output.testCode'], 'True') @@ -162,9 +198,8 @@ jobs: - script: 'python setup.py pytest' displayName: 'Unit tests' env: - PYTEST_ADDOPTS: '-m "not notebook"' + PYTEST_ADDOPTS: '-m "not (notebook or automl)"' COVERAGE_PROCESS_START: 'setup.cfg' - - task: PublishTestResults@2 displayName: 'Publish Test Results **/test-results.xml' inputs: diff --git a/doc/reference.rst b/doc/reference.rst index cb2b80356..92bc265d8 100644 --- a/doc/reference.rst +++ b/doc/reference.rst @@ -17,6 +17,7 @@ Public Module Reference econml.metalearners econml.two_stage_least_squares econml.utilities + econml.automated_ml Private Module Reference ======================== diff --git a/econml/automated_ml.py b/econml/automated_ml.py new file mode 100644 index 000000000..aaeab821c --- /dev/null +++ b/econml/automated_ml.py @@ -0,0 +1,363 @@ +# AzureML +from azureml.core.experiment import Experiment +from azureml.core import Workspace +from azureml.train.automl.automlconfig import AutoMLConfig +from azureml._base_sdk_common.common import ProjectSystemException +from sklearn.multioutput import MultiOutputRegressor +# helper imports +import time +import copy + +"""Automated Machine Learning Support For EconML Estimators. This allows analysts +to use AutomatedML to automate the process of selecting models for models Y, T, +and final of their causal inference estimator.
+""" + +LINEAR_MODELS_SET = set([ + "ElasticNet", + "LassoLars", + "LinearRegressor", + "FastLinearRegressor", + "OnlineGradientDescentRegressor", + "SGDRegressor" +]) + + +SAMPLE_WEIGHTS_MODELS_SET = set([ + "ElasticNet", + "LightGBM", + "GradientBoostingRegressor", + "DecisionTreeRegressor", + "KNeighborsRegressor", + "LassoLars", + "SGDRegressor", + "RandomForestRegressor", + "ExtraTreesRegressor", + "LinearRegressor", + "FastLinearRegressor", + "OnlineGradientDescentRegressor" + ]) + + +def setAutomatedMLWorkspace(create_workspace=False, + create_resource_group=False, workspace_region=None, *, + subscription_id=None, resource_group=None, workspace_name=None, auth=None): + """Set configuration file for AutomatedML actions with the EconML library. If + ``create_workspace`` is set true, a new workspace is created + for the user. If ``create_resource_group`` is also set true, a new resource group is + created for the user. + + Parameters + ---------- + + create_workspace: Boolean, optional, default False + If set to true, a new workspace will be created if the specified + workspace does not exist. + + create_resource_group: Boolean, optional, default False + If set to true, a new resource_group will be created if the specified + resource_group does not exist. + + workspace_region: String, optional + Region of workspace, only necessary if ``create_workspace`` is set to true and a + new workspace is being created. + + auth: azureml.core.authentication.AbstractAuthentication, optional + If set EconML will use auth object for handling Azure Authentication. + Otherwise, EconML will use interactive authentication, opening an + authentication portal in the browser. + + subscription_id: String, required + ID of the Azure subscription containing the workspace to be created + or set. + + resource_group: String, required + Name of resource group of workspace to be created or set.
+ + workspace_name: String, required + Name of the workspace to be created or set. + """ + try: + ws = Workspace(subscription_id=subscription_id, resource_group=resource_group, + workspace_name=workspace_name, auth=auth) + # write the details of the workspace to a configuration file to the notebook library + ws.write_config() + print("Workspace configuration has succeeded.") + except ProjectSystemException: + if(create_workspace): + if(create_resource_group): + print("Workspace not accessible. Creating a new workspace and \ + resource group.") + ws = Workspace.create(name=workspace_name, + subscription_id=subscription_id, + resource_group=resource_group, + location=workspace_region, + create_resource_group=create_resource_group, + sku='basic', + auth=auth, + exist_ok=True) + ws.get_details() + else: + print("Workspace not accessible. Set \ + create_resource_group = True and run again to create a new \ + workspace and resource group.") + else: + print("Workspace not accessible. Set create_workspace = True \ + to create a new workspace.") + + +def addAutomatedML(baseClass): + """ + Enables base class to use EconAutoMLConfig objects instead of models + by adding the AutomatedMLMixin to specified base class. Once this Mixin + has been added, EconML classes can be initialized with EconAutoMLConfig + objects rather than scikit learn models. + + + Parameters + ---------- + + baseClass: Class, required + Definition of a class that will serve as the parent class of the + AutomatedMLMixin. + + Returns + ---------- + + automatedMLClass: Class + A modified version of ``baseClass`` that accepts the parameters of the + AutomatedML Mixin in addition to the original class objects. + + """ + + class AutomatedMLClass(AutomatedMLMixin, baseClass): + pass + return AutomatedMLClass + + +class AutomatedMLModel(): + def __init__(self, automl_config, workspace, experiment_name_prefix="aml_experiment"): + """ + scikit-learn style model fitted and specified with automatedML.
+ + automatedML uses AzureML's Automated Machine Learning library + to automatically preprocess data, specify features, and + select a model given a pair of training data and labels. + + Parameters + ---------- + + automl_config: azureml.train.automl.automlconfig.AutoMLConfig, required + Configuration for submitting an Automated Machine Learning experiment in Azure Machine Learning. + This configuration object contains and persists the parameters for configuring the experiment + run parameters, as well as the training data to be used at run time. For guidance on selecting + your settings, you may refer to + https://docs.microsoft.com/azure/machine-learning/service/how-to-configure-auto-train. + + workspace: azureml.core.Workspace, required + The workspace in which the experiments for the automatedML runs are created. + + experiment_name_prefix: String, optional + Prefix of the experiment name generated by AutomatedMLModel. The full name of + the experiment will be {EXPERIMENT_NAME_PREFIX}_{INITIALIZE_EXPERIMENT_TIMESTAMP}. + Must be comprised of alphanumeric characters, hyphens, underscores and have at most 18 characters. + """ + self._innerModel = _InnerAutomatedMLModel( + automl_config, workspace, experiment_name_prefix=experiment_name_prefix) + + def fit(self, X, y, sample_weight=None): + """ + Select and fit model. + + Parameters + ---------- + + X: numpy.ndarray or pandas.DataFrame, required + The training features to use when fitting pipelines during AutoML experiment. + + y: numpy.ndarray or pandas.DataFrame, required + Training labels to use when fitting pipelines during AutoML experiment. + + sample_weight: numpy.ndarray or pandas.DataFrame, optional + The weight to give to each training sample when running fitting pipelines, + each row should correspond to a row in X and y data. + + experiment_name_prefix: String, optional + Prefix of the experiment name generated by AutomatedMLModel.
The full name of + the experiment will be {EXPERIMENT_NAME_PREFIX}_{INITIALIZE_EXPERIMENT_TIMESTAMP}. + Must be comprised of alphanumeric characters, hyphens, underscores and have at most 18 characters. + """ + # if y is a multioutput model + if y.ndim > 1: + # Make sure second dimension has 1 or more item + if y.shape[1] > 1: + # switch _inner Model to a MultiOutputRegressor + self._innerModel = MultiOutputRegressor(self._innerModel) + self._innerModel.fit(X, y, sample_weight=sample_weight) + return + else: + # flatten array as automl only takes vectors for y + y = y.flatten() + self._innerModel.fit(X, y, sample_weight=sample_weight) + + def predict(self, X): + """ + Predict using selected and fitted model. + + X: numpy.ndarray or pandas.DataFrame, required + The training features to use for predicting labels + """ + return self._innerModel.predict(X) + + def predict_proba(self, X): + """ + Predict using selected and fitted model. + + X: numpy.ndarray or pandas.DataFrame, required + The training features to use for predicting label probabilities. + """ + return self._innerModel.predict_proba(X) + + +class _InnerAutomatedMLModel(): + # Inner single model to be passed that wrapper can use to pass into MultiOutputRegressor + def __init__(self, automl_config, workspace, + experiment_name_prefix="aml_experiment"): + self._show_output = automl_config._show_output + self._workspace = workspace + self._automl_config = automl_config + self._experiment_name_prefix = experiment_name_prefix + + def get_params(self, deep=True): + # Must be implemented for MultiOutputRegressor to view _InnerAutomatedMLModel + # as an sklearn estimator + return { + 'workspace': self._workspace, + 'automl_config': self._automl_config, + 'experiment_name_prefix': self._experiment_name_prefix + } + + def fit(self, X, y, sample_weight=None): + # fit implementation for a single output model. 
+ # Create experiment for specified workspace + automl_config = copy.deepcopy(self._automl_config) + current_time = time.localtime() + current_time_string = time.strftime('%y_%m_%d-%H_%M_%S', current_time) + experiment_name = self._experiment_name_prefix + "_" + current_time_string + self._experiment = Experiment(self._workspace, experiment_name) + # Configure automl_config with training set information. + automl_config.user_settings['X'] = X + automl_config.user_settings['y'] = y + automl_config.user_settings['sample_weight'] = sample_weight + # Wait for remote run to complete, the set the model + print("Experiment " + experiment_name + " has started.") + local_run = self._experiment.submit(automl_config, show_output=self._show_output) + print("Experiment " + experiment_name + " completed.") + _, self._model = local_run.get_output() + + def predict(self, X): + return self._model.predict(X) + + def predict_proba(self, X): + return self._model.predict_proba(X) + + +class AutomatedMLMixin(): + def __init__(self, *args, **kwargs): + """ + Mixin enabling users to leverage automatedML as their model of choice in + Double Machine Learners and Doubly Robust Learners. It instantiates + AutomatedMLModels for each automl_config provided and pass them as + parameters into its parent class. + + Parameters + ---------- + args: List, optional + args that are passed in order to initiate the final automatedML run. + Any arg, that is an AutoMLConfig, will be converted into as + AutomatedMLModel. + + kwargs: Dict, optional + kwargs that are passed in order to initiate the final automatedML run. + Any kwarg, that is an AutoMLConfig, will be converted into as + AutomatedMLModel. + """ + # Loop through the kwargs and args if any of them is an AutoMLConfig file, pass them + # create model and pass model into final. 
+ new_args = () + for idx, var in enumerate(args): + # If item is an automl config, get its corresponding AutomatedML Model (positional args + # have no keyword name, so the experiment prefix is derived from the position) and add it to new_args + if isinstance(var, EconAutoMLConfig): + var = self._get_automated_ml_model(var, "arg_" + str(idx)) + new_args += (var,) + + for key in kwargs: + kwarg = kwargs[key] + # If item is an automl config, get its corresponding + # AutomatedML Model and set it for this key in + # kwargs + if isinstance(kwarg, EconAutoMLConfig): + kwargs[key] = self._get_automated_ml_model(kwarg, key) + + super().__init__(*new_args, **kwargs) + + def _get_automated_ml_model(self, automl_config, prefix): + # takes in either automated_ml config and instantiates + # an AutomatedMLModel + # The prefix can only be 18 characters long + # because prefixes come from kwarg_names, we must ensure they are + # short enough. + prefix = prefix[:18] + # Get workspace from config file. + workspace = Workspace.from_config() + return AutomatedMLModel(automl_config, workspace, + experiment_name_prefix=prefix) + + +class EconAutoMLConfig(AutoMLConfig): + + def __init__(self, sample_weights_required=False, linear_model_required=False, show_output=False, **kwargs): + """ + Azure AutoMLConfig object with added guards to ensure correctness when used + with EconML + + Parameters + ---------- + + sample_weights_required: Boolean, optional, default False + If set true, only models that require sample weights will be selected during + AutomatedML. + + linear_model_required: Boolean, optional, default False + If set to true, only linear models will be selected during AutomatedML. + + show_output: Boolean, optional, default False + If set to true, outputs for the corresponding AutomatedMLModel + will be shown when it is fitted. + + kwargs: list, optional + List of kwargs to be passed to a corresponding AutoML Config object.
+ To view the full documentation of the kwargs, you may refer to + https://docs.microsoft.com/en-us/python/api/azureml-train-automl-client + /azureml.train.automl.automlconfig.automlconfig?view=azure-ml-py + + """ + whitelist_models = None + if linear_model_required and sample_weights_required: + # Take the intersect of the white for sample + # weights and linear models + whitelist_models = list(LINEAR_MODELS_SET.intersection(SAMPLE_WEIGHTS_MODELS_SET)) + + else: + if(linear_model_required): + whitelist_models = list(LINEAR_MODELS_SET) + if(sample_weights_required): + whitelist_models = list(SAMPLE_WEIGHTS_MODELS_SET) + + kwargs['whitelist_models'] = whitelist_models + + # show output is not stored in the config in AutomatedML, so we need to make it a field. + self._show_output = show_output + + super().__init__(**kwargs) diff --git a/econml/tests/test_automated_ml.py b/econml/tests/test_automated_ml.py new file mode 100644 index 000000000..1e0bae952 --- /dev/null +++ b/econml/tests/test_automated_ml.py @@ -0,0 +1,208 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+ +import unittest +import pytest +from sklearn.linear_model import LinearRegression, Lasso, \ + LogisticRegression +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, \ + PolynomialFeatures +from sklearn.model_selection import KFold +from econml.dml import * +from econml.metalearners import * +from econml.drlearner import DRLearner +import numpy as np +from econml.utilities import shape, hstack, vstack, reshape, \ + cross_product +from econml.inference import BootstrapInference +from contextlib import ExitStack +from sklearn.ensemble import RandomForestRegressor, \ + GradientBoostingRegressor, GradientBoostingClassifier +import itertools +from econml.sklearn_extensions.linear_model import WeightedLasso +from econml.tests.test_statsmodels import _summarize +import econml.tests.utilities # bugfix for assertWarns +import copy +import logging +from econml.data.dgps import ihdp_surface_B +import os + +try: + from azureml.train.automl.exceptions import ClientException + from azureml.core.authentication import AzureCliAuthentication + from econml.automated_ml import * + AutomatedTLearner = addAutomatedML(TLearner) + AutomatedSLearner = addAutomatedML(SLearner) + AutomatedXLearner = addAutomatedML(XLearner) + AutomatedDomainAdaptationLearner = \ + addAutomatedML(DomainAdaptationLearner) + AutomatedDRLearner = addAutomatedML(DRLearner) + AutomatedDMLCateEstimator = addAutomatedML(DMLCateEstimator) + AutomatedLinearDMLCateEstimator = addAutomatedML(LinearDMLCateEstimator) + AutomatedSparseLinearDMLCateEstimator = \ + addAutomatedML(SparseLinearDMLCateEstimator) + AutomatedKernelDMLCateEstimator = addAutomatedML(KernelDMLCateEstimator) + AutomatedNonParamDMLCateEstimator = \ + addAutomatedML(NonParamDMLCateEstimator) + AutomatedForestDMLCateEstimator = addAutomatedML(ForestDMLCateEstimator) + + AUTOML_SETTINGS_REG = { + 'experiment_timeout_minutes': 1, + 'enable_early_stopping': True, + 'iteration_timeout_minutes': 1, 
+ 'max_cores_per_iteration': 1, + 'n_cross_validations': 2, + 'preprocess': False, + 'featurization': 'off', + 'enable_stack_ensemble': False, + 'enable_voting_ensemble': False, + 'primary_metric': 'normalized_mean_absolute_error', + } + + AUTOML_SETTINGS_CLF = { + 'experiment_timeout_minutes': 1, + 'enable_early_stopping': True, + 'iteration_timeout_minutes': 1, + 'max_cores_per_iteration': 1, + 'n_cross_validations': 2, + 'enable_stack_ensemble': False, + 'preprocess': False, + 'enable_voting_ensemble': False, + 'featurization': 'off', + 'primary_metric': 'AUC_weighted', + } + + AUTOML_CONFIG_REG = EconAutoMLConfig(task='regression', + debug_log='automl_errors.log', + enable_onnx_compatible_models=True, model_explainability=True, + **AUTOML_SETTINGS_REG) + + AUTOML_CONFIG_CLF = EconAutoMLConfig(task='classification', + debug_log='automl_errors.log', + enable_onnx_compatible_models=True, model_explainability=True, + **AUTOML_SETTINGS_CLF) + + AUTOML_CONFIG_LINEAR_REG = EconAutoMLConfig(task='regression', + debug_log='automl_errors.log', + linear_model_required=True, + enable_onnx_compatible_models=True, model_explainability=True, + **AUTOML_SETTINGS_REG) + + AUTOML_CONFIG_SAMPLE_WEIGHT_REG = EconAutoMLConfig(task='regression', + debug_log='automl_errors.log', + linear_model_required=True, + enable_onnx_compatible_models=True, model_explainability=True, + **AUTOML_SETTINGS_REG) + + def automl_model_reg(): + return copy.deepcopy(AUTOML_CONFIG_REG) + + def automl_model_clf(): + return copy.deepcopy(AUTOML_CONFIG_CLF) + + # Linear models are required for parametric dml + + def automl_model_linear_reg(): + return copy.deepcopy(AUTOML_CONFIG_LINEAR_REG) + + # sample weighting models are required for nonparametric dml + + def automl_model_sample_weight_reg(): + return copy.deepcopy(AUTOML_CONFIG_SAMPLE_WEIGHT_REG) + + # Test values + Y, T, X, _ = ihdp_surface_B() +except ImportError: + pass # automl not installed + + +@pytest.mark.automl +class 
TestAutomatedDML(unittest.TestCase): + + @classmethod + def setUpClass(cls): + subscription_id = os.getenv("SUBSCRIPTION_ID") + resource_group = os.getenv("RESOURCE_GROUP") + workspace_name = os.getenv("WORKSPACE_NAME") + + auth = AzureCliAuthentication() + + setAutomatedMLWorkspace(auth=auth, + subscription_id=subscription_id, + resource_group=resource_group, workspace_name=workspace_name) + + def test_nonparam(self): + """Testing the completion of the fit and effect estimation of an automated Nonparametic DMLCateEstimator""" + Y, T, X, _ = ihdp_surface_B() + est = AutomatedNonParamDMLCateEstimator(model_y=automl_model_reg(), + model_t=automl_model_clf(), + model_final=automl_model_sample_weight_reg(), featurizer=None, + discrete_treatment=True) + est.fit(Y, T, X) + _ = est.effect(X) + + def test_param(self): + """Testing the completion of the fit and effect estimation of an automated Parametric DMLCateEstimator""" + Y, T, X, _ = ihdp_surface_B() + est = AutomatedLinearDMLCateEstimator(model_y=automl_model_reg(), + model_t=GradientBoostingClassifier(), + featurizer=None, + discrete_treatment=True) + est.fit(Y, T, X) + _ = est.effect(X) + + def test_forest_dml(self): + """Testing the completion of the fit and effect estimation of an AutomatedForestDMLCateEstimator""" + + Y, T, X, _ = ihdp_surface_B() + est = AutomatedForestDMLCateEstimator(model_y=automl_model_reg(), + model_t=GradientBoostingClassifier(), + discrete_treatment=True, + n_estimators=1000, + subsample_fr=.8, + min_samples_leaf=10, + min_impurity_decrease=0.001, + verbose=0, min_weight_fraction_leaf=.01) + est.fit(Y, T, X) + _ = est.effect(X) + + +@pytest.mark.automl +class TestAutomatedMetalearners(unittest.TestCase): + + def test_TLearner(self): + """Testing the completion of the fit and effect estimation of an AutomatedTLearner""" + # TLearner test + # Instantiate TLearner + Y, T, X, _ = ihdp_surface_B() + est = AutomatedTLearner(models=automl_model_reg()) + + # Test constant and heterogeneous 
treatment effect, single and multi output y + + est.fit(Y, T, X) + _ = est.effect(X) + + def test_SLearner(self): + """Testing the completion of the fit and effect estimation of an AutomatedSLearner""" + # Test constant treatment effect with multi output Y + # Test heterogeneous treatment effect + # Need interactions between T and features + Y, T, X, _ = ihdp_surface_B() + est = AutomatedSLearner(overall_model=automl_model_reg()) + + est.fit(Y, T, X) + _ = est.effect(X) + + # Test heterogeneous treatment effect with multi output Y + + def test_DALearner(self): + """Testing the completion of the fit and effect estimation of an AutomatedDomainAdaptationLearner""" + + # Instantiate DomainAdaptationLearner + + est = AutomatedDomainAdaptationLearner(models=automl_model_reg(), + final_models=automl_model_reg()) + + est.fit(Y, T, X) + _ = est.effect(X) diff --git a/notebooks/AutomatedML/Automated Machine Learning For EconML.ipynb b/notebooks/AutomatedML/Automated Machine Learning For EconML.ipynb new file mode 100644 index 000000000..6e2eb8179 --- /dev/null +++ b/notebooks/AutomatedML/Automated Machine Learning For EconML.ipynb @@ -0,0 +1,1318 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Copyright (c) Microsoft Corporation. All rights reserved.\n", + "\n", + "Licensed under the MIT License." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/automated-machine-learning/classification-bank-marketing-all-features/auto-ml-classification-bank-marketing.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Automated Machine Learning For EconML\n", + "\n", + "This is a version of the evaluation for running in the cloud, it has settings that leverage greater access to compute." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Imports" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\v-keacqu\\AppData\\Local\\Continuum\\miniconda3\\envs\\azure_automl\\lib\\site-packages\\sklearn\\externals\\joblib\\__init__.py:15: DeprecationWarning: sklearn.externals.joblib is deprecated in 0.21 and will be removed in 0.23. Please import this functionality directly from joblib, which can be installed with: pip install joblib. If this warning is raised when loading pickled models, you may need to re-serialize those models with scikit-learn 0.21+.\n", + " warnings.warn(msg, category=DeprecationWarning)\n" + ] + } + ], + "source": [ + "import logging\n", + "import sys\n", + "# Main imports\n", + "from econml.dml import DMLCateEstimator, LinearDMLCateEstimator,SparseLinearDMLCateEstimator,ForestDMLCateEstimator, NonParamDMLCateEstimator, KernelDMLCateEstimator\n", + "from matplotlib import pyplot as plt\n", + "import pandas as pd\n", + "import os\n", + "import copy\n", + "\n", + "import azureml.core\n", + "from azureml.core.experiment import Experiment\n", + "from azureml.core.workspace import Workspace\n", + "from azureml.automl.core.featurization import FeaturizationConfig\n", + "from azureml.core.dataset import Dataset\n", + "from azureml.explain.model._internal.explanation_client import ExplanationClient\n", + "\n", + "#Import scikit-learn model AutoML wrapper\n", + "from econml.automated_ml import addAutomatedML,setAutomatedMLWorkspace, EconAutoMLConfig\n", + "# Imports for linear double machine learning evaluation\n", + "from econml.dml import LinearDMLCateEstimator\n", + "from matplotlib import pyplot as plt\n", + "import pandas as pd\n", + "import os\n", + "# Helper imports\n", + "import math\n", + "import numpy as np\n", + "from itertools import product\n", + "from sklearn.linear_model import Lasso, 
LassoCV, LogisticRegression, LogisticRegressionCV,LinearRegression,MultiTaskElasticNet,MultiTaskElasticNetCV\n", + "from sklearn.ensemble import RandomForestRegressor,RandomForestClassifier\n", + "from sklearn.preprocessing import PolynomialFeatures\n", + "from sklearn.model_selection import train_test_split\n", + "import warnings\n", + "warnings.filterwarnings(\"ignore\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## AutoML Setup\n", + "\n", + "To begin we will configure our AutoML instance and load environment variable necessary to leverage AutoML." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Workspace configuration has succeeded.\n" + ] + } + ], + "source": [ + "setAutomatedMLWorkspace(workspace_name = \"\",\n", + " subscription_id=\"\",\n", + " resource_group=\"\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The first step for using AutoML with double machine learning is to specify the configuration object of your AutoML target, this `AutoMLConfig` object specifies the settings of the AutoML session that will be used to develop your model." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "automl_settings_reg = {\n", + " \"experiment_timeout_minutes\" : 1,\n", + " \"enable_early_stopping\" : True,\n", + " \"iteration_timeout_minutes\": 1,\n", + " \"max_cores_per_iteration\": -1,\n", + " \"n_cross_validations\": 2,\n", + " 'preprocess': False,\n", + " \"featurization\": 'off',\n", + " \"verbosity\": logging.INFO,\n", + " \"primary_metric\": 'normalized_mean_absolute_error'\n", + "}\n", + "\n", + "automl_settings_clf = {\n", + " \"experiment_timeout_minutes\" : 1,\n", + " \"enable_early_stopping\" : True,\n", + " \"iteration_timeout_minutes\": 1,\n", + " \"max_cores_per_iteration\": -1,\n", + " \"n_cross_validations\": 2,\n", + " 'preprocess': False,\n", + " \"featurization\": 'off',\n", + " \"verbosity\": logging.INFO,\n", + " \"primary_metric\": 'AUC_weighted'\n", + "}\n", + "\n", + "automl_config_reg = EconAutoMLConfig(task = 'regression',\n", + " debug_log = 'automl_errors.log',\n", + " enable_onnx_compatible_models=True,\n", + " model_explainability=True,\n", + " **automl_settings_reg\n", + " )\n", + "\n", + "automl_config_clf = EconAutoMLConfig(task = 'classification',\n", + " debug_log = 'automl_errors.log',\n", + " enable_onnx_compatible_models=True,\n", + " model_explainability=True,\n", + " **automl_settings_clf\n", + " )\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Grid Search Baseline\n", + "\n", + "In order to evaluate our model against a well performing Random Forest Classifier, we need to specify a class that completes grid search CV over the RandomForestRegressor class and chooses the best resulting model. Below is the class definition and as well as two funtions for instantiating our RandomForestRegressor wrapper." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.model_selection import GridSearchCV\n", + "from sklearn.pipeline import Pipeline\n", + "from sklearn.preprocessing import PolynomialFeatures\n", + "\n", + "class GridSearchCVList:\n", + "\n", + " def __init__(self, estimator_list, param_grid_list, scoring=None,\n", + " n_jobs=None, iid='warn', refit=True, cv='warn', verbose=0, pre_dispatch='2*n_jobs',\n", + " error_score='raise-deprecating', return_train_score=False):\n", + " self._gcv_list = [GridSearchCV(estimator, param_grid, scoring=scoring,\n", + " n_jobs=n_jobs, iid=iid, refit=refit, cv=cv, verbose=verbose,\n", + " pre_dispatch=pre_dispatch, error_score=error_score,\n", + " return_train_score=return_train_score)\n", + " for estimator, param_grid in zip(estimator_list, param_grid_list)]\n", + " return\n", + "\n", + " def fit(self, X, y, sample_weight = None, **fit_params):\n", + " self.best_ind_ = np.argmax([gcv.fit(X, y, sample_weight = sample_weight, **fit_params).best_score_ for gcv in self._gcv_list])\n", + " self.best_estimator_ = self._gcv_list[self.best_ind_].best_estimator_\n", + " self.best_score_ = self._gcv_list[self.best_ind_].best_score_\n", + " self.best_params_ = self._gcv_list[self.best_ind_].best_params_\n", + " return self\n", + "\n", + " def predict(self, X):\n", + " return self.best_estimator_.predict(X)\n", + "\n", + " def predict_proba(self, X):\n", + " return self.best_estimator_.predict_proba(X)\n", + "\n", + "from sklearn.linear_model import LassoCV, LogisticRegressionCV\n", + "from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor\n", + "from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier\n", + "from econml.sklearn_extensions.linear_model import WeightedLassoCV\n", + "\n", + "def grid_search_reg():\n", + " return GridSearchCVList([RandomForestRegressor(n_estimators=100, random_state=123),\n", + " 
GradientBoostingRegressor(random_state=123)],\n", + " param_grid_list=[{'max_depth': [3, None],\n", + " 'min_samples_leaf': [10, 50]},\n", + " {'n_estimators': [50, 100],\n", + " 'max_depth': [3],\n", + " 'min_samples_leaf': [10, 30]}],\n", + " cv=3,\n", + " iid=True,\n", + " scoring='neg_mean_squared_error')\n", + "\n", + "\n", + "def grid_search_clf():\n", + " return GridSearchCVList([RandomForestClassifier(n_estimators=100, random_state=123),\n", + " GradientBoostingClassifier(random_state=123)],\n", + " param_grid_list=[{'max_depth': [3, 5],\n", + " 'min_samples_leaf': [10, 50]},\n", + " {'n_estimators': [50, 100],\n", + " 'max_depth': [3],\n", + " 'min_samples_leaf': [10, 30]}],\n", + " cv=3,\n", + " iid=True,\n", + " scoring='neg_mean_squared_error')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### DGP \n", + "The DGP is described by the following equations:\n", + "\n", + "\\begin{align}\n", + "T =& g(W \\cdot \\beta) + \\eta, & \\;\\eta \\sim \\text{Uniform}(-1, 1)\\\\\n", + "Y =& T\\cdot \\theta + m(W\\cdot \\beta) + \\epsilon, &\\; \\epsilon \\sim \\text{Uniform}(-1, 1)\\\\\n", + "W \\sim& \\text{Normal}(0,\\, I_{n_w})\\\\\n", + "X \\sim& \\text{Uniform}(0,1)^{n_x}\n", + "\\end{align}\n", + "\n", + "where $W$ is a matrix of high-dimensional confounders, $m, g$ can be nonlinear, and $\\beta, \\gamma$ have high sparsity.\n", + "\n", + "For this DGP, \n", + "\\begin{align}\n", + "\\theta(x) = \\exp(2\\cdot x_1)\\\\\n", + "g(x) = sin^2(x)\\\\\n", + "m(x;\\eta,\\gamma)=\\frac{1}{2\\pi}\\frac{sinh(\\gamma)}{cosh(\\gamma)-cos(x-\\eta)}\n", + "\\end{align}\n", + "\n", + "Let’s keep it simple and set $\\eta = 0$ and $\\gamma = 1$." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Structural Model for Estimation\n", + "\n", + "The model we are trying to estimate is as follows\n", + "\n", + "$$t_i = g(w_i\\cdot b) + \\eta$$\n", + "$$y_i = t_i\\cdot \\theta + m(w_i\\cdot b) + \\epsilon$$\n", + "\n", + "For this example the true value of the causal parameter will be $\\theta = 0.5$" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# Treatment effect function\n", + "def te(x):\n", + " return np.sin(2*math.pi*x[0])/2+0.5\n", + "def g(x):\n", + " return np.power(np.sin(x),2)\n", + "def m(x,nu=0.,gamma=1.):\n", + " return 0.5/math.pi*(np.sinh(gamma))/(np.cosh(gamma)-np.cos(x-nu))\n", + "#vectorized g and m for applying to dataset\n", + "vg = np.vectorize(g)\n", + "vm = np.vectorize(m)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "# DGP constants\n", + "np.random.seed(123)\n", + "n = 10000\n", + "n_w = 30\n", + "support_size = 5\n", + "n_x = 1\n", + "# Outcome support\n", + "support_Y = np.random.choice(np.arange(n_w), size=support_size, replace=False)\n", + "coefs_Y = np.random.uniform(0, 1, size=support_size)\n", + "epsilon_sample = lambda n: np.random.uniform(-1, 1, size=n)\n", + "# Treatment support\n", + "support_T = support_Y\n", + "coefs_T = np.random.uniform(0, 1, size=support_size)\n", + "eta_sample = lambda n: np.random.uniform(-1, 1, size=n)\n", + "\n", + "# Generate controls, covariates, treatments and outcomes\n", + "W = np.random.normal(0, 1, size=(n, n_w))\n", + "X = np.random.uniform(0, 1, size=(n, n_x))\n", + "# Heterogeneous treatment effects\n", + "TE = np.array([te(x_i) for x_i in X])\n", + "\n", + "T = vg(np.dot(W[:, support_T], coefs_T))+ eta_sample(n)\n", + "Y = TE * T + vm(np.dot(W[:, support_Y], coefs_Y))+ epsilon_sample(n)\n", + "\n", + "Y_train, Y_val, T_train, T_val, X_train, X_val, W_train, W_val = 
train_test_split(Y, T, X, W, test_size=.2)\n", + "# Generate test data\n", + "X_test = np.array(list(product(np.arange(0, 1, 0.01), repeat=n_x)))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Train Estimators\n", + "We train models in three different ways, and compare their performance.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Default Setting" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "est = LinearDMLCateEstimator(model_y=RandomForestRegressor(),\n", + " model_t=RandomForestRegressor(),\n", + " random_state=123)\n", + "est.fit(Y_train, T_train, X_train, W_train)\n", + "te_pred_rf_regressor = est.effect(X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "est = LinearDMLCateEstimator(model_y=grid_search_reg(),\n", + " model_t=grid_search_reg(),\n", + " random_state=123)\n", + "est.fit(Y_train, T_train, X_train, W_train)\n", + "te_pred_cv_regressor = est.effect(X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Experiment model_t_20_01_31-16_34_42 has started.\n", + "Experiment model_t_20_01_31-16_34_42 completed.\n", + "Experiment model_y_20_01_31-16_36_44 has started.\n" + ] + } + ], + "source": [ + "AutomatedLinearDMLCateEstimator = addAutomatedML(LinearDMLCateEstimator)\n", + "est = AutomatedLinearDMLCateEstimator(model_y=automl_config_reg,\n", + " model_t=automl_config_reg,\n", + " random_state=123)\n", + "est.fit(Y_train, T_train, X_train, W_train)\n", + "te_pred_aml_regressor = est.effect(X_test)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Performance Visualization" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + 
"text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.figure(figsize=(10,6))\n", + "expected_te = np.array([te(x_i) for x_i in X_test])\n", + "plt.plot(X_test, expected_te, 'b--', label='True effect')\n", + "plt.plot(X_test, te_pred_rf_regressor, label='DML Polynomial Features with RF Regressor')\n", + "plt.plot(X_test, te_pred_aml_regressor, label='DML Polynomial Features with AML Regressor')\n", + "plt.ylabel('Treatment Effect')\n", + "plt.xlabel('x')\n", + "plt.legend()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Polynomial Features with Regularization" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [], + "source": [ + "est = DMLCateEstimator(model_y=RandomForestRegressor(),\n", + " model_t=RandomForestRegressor(),\n", + " model_final=Lasso(alpha=0.0001, fit_intercept=False),\n", + " featurizer=PolynomialFeatures(degree=5),\n", + " random_state=123)\n", + "est.fit(Y_train, T_train, X_train, W_train)\n", + "te_pred_rf_regressor_2=est.effect(X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "est = DMLCateEstimator(model_y=grid_search_reg(),\n", + " model_t=grid_search_reg(),\n", + " model_final=Lasso(alpha=0.0001, fit_intercept=False),\n", + " featurizer=PolynomialFeatures(degree=5),\n", + " random_state=123)\n", + "est.fit(Y_train, T_train, X_train, W_train)\n", + "te_pred_cv_regressor_2 = est.effect(X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Experiment model_t_20_01_28-13_56_29 has started.\n", + "Experiment model_t_20_01_28-13_56_29 completed.\n", + "Experiment model_y_20_01_28-13_58_02 has started.\n", + "Experiment model_y_20_01_28-13_58_02 completed.\n", + "Experiment model_t_20_01_28-13_59_42 has 
started.\n", + "Experiment model_t_20_01_28-13_59_42 completed.\n", + "Experiment model_y_20_01_28-14_01_18 has started.\n", + "Experiment model_y_20_01_28-14_01_18 completed.\n" + ] + } + ], + "source": [ + "AutomatedDMLCateEstimator = addAutomatedML(DMLCateEstimator)\n", + "est = AutomatedDMLCateEstimator(model_y=automl_config_reg,\n", + " model_t=automl_config_reg,\n", + " model_final=Lasso(alpha=0.0001, fit_intercept=False),\n", + " featurizer=PolynomialFeatures(degree=5),\n", + " random_state=123)\n", + "est.fit(Y_train, T_train, X_train, W_train)\n", + "te_pred_aml_regressor_2=est.effect(X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.figure(figsize=(10,6))\n", + "plt.plot(X_test, te_pred_rf_regressor_2, label='DML Polynomial Features with RF Regressor')\n", + "#plt.plot(X_test, te_pred_cv_regressor_2, label='DML Polynomial Features with Grid Search Regressor')\n", + "plt.plot(X_test, te_pred_aml_regressor_2, label='DML Polynomial Features with AML Regressor')\n", + "expected_te = np.array([te(x_i) for x_i in X_test])\n", + "plt.plot(X_test, expected_te, 'b--', label='True effect')\n", + "plt.ylabel('Treatment Effect')\n", + "plt.xlabel('x')\n", + "plt.legend()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Random Forest Final Stage\n" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "est = ForestDMLCateEstimator(model_y=RandomForestRegressor(),\n", + " model_t=RandomForestRegressor(),\n", + " discrete_treatment=False,\n", + " n_estimators=1000,\n", + " subsample_fr=.8,\n", + " min_samples_leaf=10,\n", + " min_impurity_decrease=0.001,\n", + " verbose=0, min_weight_fraction_leaf=.01)\n", + "est.fit(Y_train, T_train, X_train, W_train)\n", + "te_pred_rf_regressor_3 = est.effect(X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "est = ForestDMLCateEstimator(model_y=grid_search_reg(),\n", + " model_t=grid_search_reg(),\n", + " discrete_treatment=False,\n", + " n_estimators=1000,\n", + " subsample_fr=.8,\n", + " min_samples_leaf=10,\n", + " min_impurity_decrease=0.001,\n", + " verbose=0, min_weight_fraction_leaf=.01)\n", + "est.fit(Y_train, T_train, X_train, W_train)\n", + "te_pcdred_cv_regressor_3 = est.effect(X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Experiment 
model_t_20_01_28-14_03_08 has started.\n", + "Experiment model_t_20_01_28-14_03_08 completed.\n", + "Experiment model_y_20_01_28-14_04_43 has started.\n", + "Experiment model_y_20_01_28-14_04_43 completed.\n", + "Experiment model_t_20_01_28-14_06_20 has started.\n", + "Experiment model_t_20_01_28-14_06_20 completed.\n", + "Experiment model_y_20_01_28-14_07_56 has started.\n", + "Experiment model_y_20_01_28-14_07_56 completed.\n" + ] + } + ], + "source": [ + "AutomatedForestDMLCateEstimator = addAutomatedML(ForestDMLCateEstimator)\n", + "est = AutomatedForestDMLCateEstimator(model_y=automl_config_reg,\n", + " model_t=automl_config_reg,\n", + " discrete_treatment=False,\n", + " n_estimators=1000,\n", + " subsample_fr=.8,\n", + " min_samples_leaf=10,\n", + " min_impurity_decrease=0.001,\n", + " verbose=0, min_weight_fraction_leaf=.01)\n", + "est.fit(Y_train, T_train, X_train, W_train)\n", + "te_pred_aml_regressor_3 = est.effect(X_test)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Performance Visualization" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plt.figure(figsize=(10,6))\n", + "expected_te = np.array([te(x_i) for x_i in X_test])\n", + "plt.plot(X_test, expected_te, 'b--', label='True effect')\n", + "plt.plot(X_test, te_pred_rf_regressor_3, label='DML Polynomial Features with RF Regressor')\n", + "#plt.plot(X_test, te_pred_cv_regressor_3, label='DML Polynomial Features with Grid Search Regressor')\n", + "plt.plot(X_test, te_pred_aml_regressor_3, label='DML Polynomial Features with AML Regressor')\n", + "plt.ylabel('Treatment Effect')\n", + "plt.xlabel('x')\n", + "plt.legend()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## AutoML Final Stage Parametric Estimator\n", + "\n", + "For correctness, all parametric double machine learning estimators (all estimators besides ``NonParamDMLEstimator``) must be trained on 
linear models. We can specify that using the parameter ``linear_model_required``.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [], + "source": [ + "automl_settings_reg = {\n", + " \"experiment_timeout_minutes\" : 10,\n", + " \"enable_early_stopping\" : True,\n", + " \"iteration_timeout_minutes\": 2,\n", + " \"max_cores_per_iteration\": -1,\n", + " \"n_cross_validations\": 2,\n", + " 'preprocess': False,\n", + " \"featurization\": 'off',\n", + " \"verbosity\": logging.INFO,\n", + " \"primary_metric\": 'normalized_mean_absolute_error'\n", + "}\n", + "automl_config_final = EconAutoMLConfig(task = 'regression',\n", + " debug_log = 'automl_errors.log',\n", + " #For correctness, parametric DML must be linear models\n", + " linear_model_required = True,\n", + " **automl_settings_reg\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [], + "source": [ + "est = DMLCateEstimator(model_y=RandomForestRegressor(),\n", + " model_t=RandomForestRegressor(),\n", + " model_final=RandomForestRegressor(),\n", + " featurizer=PolynomialFeatures(degree=5),\n", + " random_state=123)\n", + "est.fit(Y_train, T_train, X_train, W_train)\n", + "te_pred_rf_regressor_4 = est.effect(X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [], + "source": [ + "est = DMLCateEstimator(model_y=grid_search_reg(),\n", + " model_t=grid_search_reg(),\n", + " model_final=grid_search_reg(),\n", + " featurizer=PolynomialFeatures(degree=5),\n", + " random_state=123)\n", + "est.fit(Y_train, T_train, X_train, W_train)\n", + "te_pred_cv_regressor_4 = est.effect(X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Experiment model_t_20_01_28-15_50_01 has started.\n", + "Experiment model_t_20_01_28-15_50_01 completed.\n", + "Experiment
model_y_20_01_28-15_51_44 has started.\n", + "Experiment model_y_20_01_28-15_51_44 completed.\n", + "Experiment model_t_20_01_28-15_53_26 has started.\n", + "Experiment model_t_20_01_28-15_53_26 completed.\n", + "Experiment model_y_20_01_28-15_55_30 has started.\n", + "Experiment model_y_20_01_28-15_55_30 completed.\n", + "Experiment model_final_20_01_28-15_57_16 has started.\n", + "Experiment model_final_20_01_28-15_57_16 completed.\n" + ] + } + ], + "source": [ + "AutomatedDMLCateEstimator = addAutomatedML(DMLCateEstimator)\n", + "est = AutomatedDMLCateEstimator(model_y=automl_config_reg,\n", + " model_t=automl_config_reg,\n", + " model_final=automl_config_final,\n", + " featurizer=PolynomialFeatures(degree=5),\n", + " random_state=123)\n", + "est.fit(Y_train, T_train, X_train, W_train)\n", + "te_pred_aml_regressor_4 = est.effect(X_test)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Performance Visualization" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plt.figure(figsize=(10,6))\n", + "plt.plot(X_test, te_pred_rf_regressor_4, label='DML Polynomial Features with RF Regressor')\n", + "plt.plot(X_test, te_pred_cv_regressor_4, label='DML Polynomial Features with Grid Search Regressor')\n", + "plt.plot(X_test, te_pred_aml_regressor_4, label='DML Polynomial Features with AML Regressor')\n", + "expected_te = np.array([te(x_i) for x_i in X_test])\n", + "plt.plot(X_test, expected_te, 'b--', label='True effect')\n", + "plt.ylabel('Treatment Effect')\n", + "plt.xlabel('x')\n", + "plt.legend()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## With Nonlinear Models Allowed" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Experiment model_t_20_01_28-14_48_12 has started.\n", + "Experiment model_t_20_01_28-14_48_12 
completed.\n", + "Experiment model_y_20_01_28-14_50_04 has started.\n", + "Experiment model_y_20_01_28-14_50_04 completed.\n", + "Experiment model_t_20_01_28-14_51_45 has started.\n", + "Experiment model_t_20_01_28-14_51_45 completed.\n", + "Experiment model_y_20_01_28-14_53_35 has started.\n", + "Experiment model_y_20_01_28-14_53_35 completed.\n", + "Experiment model_final_20_01_28-14_55_18 has started.\n", + "Experiment model_final_20_01_28-14_55_18 completed.\n" + ] + } + ], + "source": [ + "AutomatedDMLCateEstimator = addAutomatedML(DMLCateEstimator)\n", + "est = AutomatedDMLCateEstimator(model_y=automl_config_reg,\n", + " model_t=automl_config_reg,\n", + " model_final=automl_config_reg,\n", + " featurizer=PolynomialFeatures(degree=5),\n", + " random_state=123)\n", + "est.fit(Y_train, T_train, X_train, W_train)\n", + "te_pred_aml_regressor_4 = est.effect(X_test)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Performance Visualization" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.figure(figsize=(10,6))\n", + "plt.plot(X_test, te_pred_rf_regressor_4, label='DML Polynomial Features with RF Regressor')\n", + "plt.plot(X_test, te_pred_aml_regressor_4, label='DML Polynomial Features with AML Regressor')\n", + "expected_te = np.array([te(x_i) for x_i in X_test])\n", + "plt.plot(X_test, expected_te, 'b--', label='True effect')\n", + "plt.ylabel('Treatment Effect')\n", + "plt.xlabel('x')\n", + "plt.legend()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## AutoML Final Stage Nonparametric Estimator\n", + "\n", + "For correctness, all nonparametric double machine learning estimators (such as ``NonParamDMLCateEstimator``) must be trained on models that accept sample weights. We can specify that using the parameter ``sample_weights_required``.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [], + "source": [ + "automl_config_final = EconAutoMLConfig(task = 'regression',\n", + " debug_log = 'automl_errors.log',\n", + " enable_onnx_compatible_models=True,\n", + " model_explainability=True,\n", + " #For correctness, nonparametric DML final models must accept sample weights\n", + " sample_weights_required = True,\n", + " **automl_settings_reg\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [], + "source": [ + "est = NonParamDMLCateEstimator(model_y=RandomForestRegressor(),\n", + " model_t=RandomForestRegressor(),\n", + " model_final=RandomForestRegressor(),\n", + " random_state=123)\n", + "est.fit(Y_train, T_train, X_train, W_train)\n", + "te_pred_rf_regressor_6 = est.effect(X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [], + "source": [ + "est = NonParamDMLCateEstimator(model_y=grid_search_reg(),\n", + " model_t=grid_search_reg(),\n", + "
model_final=grid_search_reg(),\n", + " random_state=123)\n", + "est.fit(Y_train, T_train, X_train, W_train)\n", + "te_pred_cv_regressor_6 = est.effect(X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": { + "collapsed": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Experiment model_t_20_01_28-14_36_01 has started.\n", + "Experiment model_t_20_01_28-14_36_01 completed.\n", + "Experiment model_y_20_01_28-14_37_57 has started.\n", + "Experiment model_y_20_01_28-14_37_57 completed.\n", + "Experiment model_t_20_01_28-14_39_43 has started.\n", + "Experiment model_t_20_01_28-14_39_43 completed.\n", + "Experiment model_y_20_01_28-14_41_26 has started.\n", + "Experiment model_y_20_01_28-14_41_26 completed.\n", + "Experiment model_final_20_01_28-14_43_03 has started.\n", + "Experiment model_final_20_01_28-14_43_03 completed.\n" + ] + } + ], + "source": [ + "AutomatedNonParamDMLCateEstimator = addAutomatedML(NonParamDMLCateEstimator)\n", + "est = AutomatedNonParamDMLCateEstimator(model_y=automl_config_reg,\n", + " model_t=automl_config_reg,\n", + " model_final=automl_config_final,\n", + " random_state=123)\n", + "est.fit(Y_train, T_train, X_train, W_train)\n", + "te_pred_aml_regressor_6 = est.effect(X_test)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Performance Visualization" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.figure(figsize=(10,6))\n", + "plt.plot(X_test, te_pred_cv_regressor_6, label='DML Polynomial Features with Grid Search Regressor')\n", + "plt.plot(X_test, te_pred_aml_regressor_6, label='DML Polynomial Features with AML Regressor')\n", + "expected_te = np.array([te(x_i) for x_i in X_test])\n", + "plt.plot(X_test, expected_te, 'b--', label='True effect')\n", + "plt.ylabel('Treatment Effect')\n", + "plt.xlabel('x')\n", + "plt.ylim(-0.5, 1.5) \n", + "plt.legend()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Automated ML for Metalearners\n", + "\n", + "### DGP\n", + "\n", + "We use the Response Surface B from [Hill (2011)](https://www.tandfonline.com/doi/pdf/10.1198/jcgs.2010.08162) to generate synthetic outcome surfaces from real-world covariates and treatment assignments (Infant Health and Development Program data). Since the original data was part of a randomized trial, a subset of the treated infants (those with non-white mothers) has been removed from the data in order to mimic the observational data setting. For more details, see [Hill (2011)](https://www.tandfonline.com/doi/pdf/10.1198/jcgs.2010.08162).\n", + "\n", + "\n", + "The DGP is described by the following equations:\n", + "\n", + "$\n", + "Y(0) = e^{(X+W)\\beta} + \\epsilon_0, \\;\\epsilon_0 \\sim N(0, 1)\\\\\n", + "Y(1) = X\\beta - \\omega + \\epsilon_1, \\;\\epsilon_1 \\sim N(0, 1)\\\\\n", + "$\n", + "\n", + "where $X$ is a covariate matrix, $W$ is a constant matrix with entries equal to $0.5$ and $\\omega$ is a constant calculated such that the CATT equals $4$."
+ ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": {}, + "outputs": [], + "source": [ + "from econml.metalearners import TLearner, SLearner, XLearner, DomainAdaptationLearner\n", + "from econml.data.dgps import ihdp_surface_B\n", + "Y, T, X, expected_te = ihdp_surface_B()" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "# Instantiate T learner\n", + "T_learner_rf = TLearner(grid_search_reg())\n", + "# Train T_learner\n", + "T_learner_rf.fit(Y, T, X)\n", + "# Estimate treatment effects on test data\n", + "T_te_rf = T_learner_rf.effect(X)" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": { + "collapsed": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Experiment models_20_01_27-12_04_27 has started.\n", + "Experiment models_20_01_27-12_04_27 completed.\n", + "Experiment models_20_01_27-12_06_04 has started.\n", + "Experiment models_20_01_27-12_06_04 completed.\n" + ] + } + ], + "source": [ + "# Instantiate T learner\n", + "AutomatedTLearner = addAutomatedML(TLearner)\n", + "T_learner_aml = AutomatedTLearner(models = automl_config_reg)\n", + "# Train T_learner\n", + "T_learner_aml.fit(Y, T, X)\n", + "# Estimate treatment effects on test data\n", + "T_te_aml = T_learner_aml.effect(X)" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# Visualization of bias distribution\n", + "plt.violinplot([np.abs(T_te_rf - expected_te),\n", + " np.abs(T_te_aml - expected_te)\n", + " ], showmeans=True)\n", + "plt.ylabel(\"Bias distribution\")\n", + "plt.xticks([1, 2], ['Grid Search', 'AutoML'])\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# S Learner" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [], + "source": [ + "# Instantiate S learner\n", + "overall_model = grid_search_reg()\n", + "S_learner_rf = SLearner(overall_model)\n", + "# Train S_learner\n", + "S_learner_rf.fit(Y, T, X)\n", + "# Estimate treatment effects on test data\n", + "S_te_rf = S_learner_rf.effect(X)" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": { + "collapsed": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Experiment overall_model_20_01_27-12_16_18 has started.\n", + "Experiment overall_model_20_01_27-12_16_18 completed.\n" + ] + } + ], + "source": [ + "# Instantiate S learner\n", + "AutomatedSLearner = addAutomatedML(SLearner)\n", + "S_learner_aml = AutomatedSLearner(overall_model= automl_config_reg)\n", + "# Train S_learner\n", + "S_learner_aml.fit(Y, T, X)\n", + "# Estimate treatment effects on test data\n", + "S_te_aml = S_learner_aml.effect(X)" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# Visualization of bias distribution\n", + "plt.violinplot([np.abs(S_te_rf - expected_te),\n", + " np.abs(S_te_aml - expected_te)\n", + " ], showmeans=True)\n", + "plt.ylabel(\"Bias distribution\")\n", + "plt.xticks([1, 2], ['Grid Search', 'AutoML'])\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Domain Adaptation Learner" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [], + "source": [ + "models = grid_search_reg()\n", + "final_models = grid_search_reg()\n", + "DA_learner_rf = DomainAdaptationLearner(models=models, final_models=final_models)\n", + "# Train DA_learner\n", + "DA_learner_rf.fit(Y, T, X)\n", + "# Estimate treatment effects on test data\n", + "DA_rf_te = DA_learner_rf.effect(X)" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": { + "collapsed": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Experiment models_20_01_27-13_19_40 has started.\n", + "Experiment models_20_01_27-13_19_40 completed.\n", + "Experiment models_20_01_27-13_21_20 has started.\n", + "Experiment models_20_01_27-13_21_20 completed.\n", + "Experiment final_models_20_01_27-13_23_01 has started.\n", + "Experiment final_models_20_01_27-13_23_01 completed.\n" + ] + } + ], + "source": [ + "models = automl_config_reg\n", + "final_models = automl_config_reg\n", + "AutomatedDomainAdaptationLearner = addAutomatedML(DomainAdaptationLearner)\n", + "DA_learner_aml = AutomatedDomainAdaptationLearner(models=models, final_models=final_models)\n", + "# Train X_learner\n", + "DA_learner_aml.fit(Y, T, X)\n", + "# Estimate treatment effects on test data\n", + "DA_te_aml = DA_learner_aml.effect(X)" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", 
+ "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# Visualization of bias distribution\n", + "plt.violinplot([np.abs(DA_rf_te - expected_te),\n", + " np.abs(DA_te_aml - expected_te)\n", + " ], showmeans=True)\n", + "plt.ylabel(\"Bias distribution\")\n", + "plt.xticks([1, 2], ['Grid Search', 'AutoML'])\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "metadata": { + "collapsed": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Experiment model_t_20_01_27-17_24_47 has started.\n", + "Experiment model_t_20_01_27-17_24_47 completed.\n", + "Experiment model_y_20_01_27-17_27_27 has started.\n", + "Experiment model_y_20_01_27-17_27_27 completed.\n", + "Experiment model_t_20_01_27-17_29_51 has started.\n", + "Experiment model_t_20_01_27-17_29_51 completed.\n", + "Experiment model_y_20_01_27-17_32_20 has started.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING - Retrying (Retry(total=2, connect=None, read=None, redirect=None, status=None)) after connection broken by 'ProtocolError('Connection aborted.', OSError(\"(10054, 'WSAECONNRESET')\",))': /azureml/ExperimentRun/dcid.AutoML_ddd4742b-4d4c-46bc-b036-e42e9a60b9a1_2/outputs/scoring_file_v_1_0_0.py?sv=2019-02-02&sr=b&sig=Z%2FSYw75Z5M38OuM6L4l3run1WpF%2BB6tr5%2FIdiJkJumM%3D&st=2020-01-27T22%3A24%3A24Z&se=2020-01-28T22%3A34%3A24Z&sp=rcw&comp=block&blockid=TURBd01EQXdNREF3TURBd01EQXdNREF3TURBd01EQXdNREF3TURBd01EQSUzRA%3D%3D&timeout=30\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Experiment model_y_20_01_27-17_32_20 completed.\n", + "Experiment model_final_20_01_27-17_34_55 has started.\n", + "Experiment model_final_20_01_27-17_34_55 completed.\n" + ] + } + ], + "source": [ + "Y, T, X, _ = ihdp_surface_B()\n", + "est = AutomatedNonParamDMLCateEstimator(model_y=automl_config_reg,\n", + " 
model_t=automl_config_clf,\n", + " model_final=automl_config_reg, featurizer=None,\n", + " discrete_treatment=True)\n", + "est.fit(Y, T, X)\n", + "_ = est.effect(X)" + ] + } + ], + "metadata": { + "authors": [ + { + "name": "anumamah" + } + ], + "category": "tutorial", + "compute": [ + "AML" + ], + "datasets": [ + "Bankmarketing" + ], + "deployment": [ + "ACI" + ], + "exclude_from_index": false, + "framework": [ + "None" + ], + "friendly_name": "Automated ML run with basic edition features.", + "index_order": 5, + "kernelspec": { + "display_name": "Python (azure_automl)", + "language": "python", + "name": "azure_automl" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.7" + }, + "tags": [ + "featurization", + "explainability", + "remote_run", + "AutomatedML" + ], + "task": "Classification" + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/setup.cfg b/setup.cfg index 8eea03541..a4ba60d0b 100644 --- a/setup.cfg +++ b/setup.cfg @@ -31,12 +31,12 @@ classifiers = Operating System :: OS Independent [options] -packages = find_namespace: +packages = find_namespace: setup_requires = pytest-runner sphinx sphinx_rtd_theme -install_requires = +install_requires = numpy scipy scikit-learn ~= 0.21.0 @@ -50,16 +50,21 @@ install_requires = matplotlib < 3.1 test_suite = econml.tests tests_require = - pytest + pytest pytest-xdist pytest-cov pandas jupyter seaborn +[options.extras_require] +automl = + azureml-sdk[explain,automl] == 1.0.83 + azure-cli + ; TODO: exclude tests? [options.packages.find] -include = +include = econml econml.* @@ -72,6 +77,7 @@ addopts = --junitxml=junit/test-results.xml -n auto --strict --cov-config=setup. markers = slow notebook + automl ; coverage configuration [coverage:run]