diff --git a/doc/source/conf.py b/doc/source/conf.py index 44df6fb..b5b6df2 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -38,6 +38,7 @@ "pandas", "dataclasses_json", "tabulate", + "scipy", ] templates_path = ["_templates"] diff --git a/doc/source/ope/index.rst b/doc/source/ope/index.rst index 1c46b66..808bce5 100644 --- a/doc/source/ope/index.rst +++ b/doc/source/ope/index.rst @@ -7,8 +7,10 @@ Roadmap - [x] Implement Inverse Probability Weighting (IPW) estimator - [x] Implement Self-Normalized Inverse Probability Weighting (SNIPW) estimator - [x] Implement Direct Method (DM) estimator -- [X] Implement Trajectory-wise Importance Sampling (TIS) estimator +- [x] Implement Trajectory-wise Importance Sampling (TIS) estimator +- [ ] Implement Self-Normalized Trajectory-wise Importance Sampling (SNTIS) estimator - [ ] Implement Per-Decision Importance Sampling (PDIS) estimator +- [ ] Implement Self-Normalized Per-Decision Importance Sampling (SNPDIS) estimator - [ ] Implement Doubly Robust (DR) estimator Implemented estimators diff --git a/doc/source/ops/index.rst b/doc/source/ops/index.rst index 7a68399..a0f4840 100644 --- a/doc/source/ops/index.rst +++ b/doc/source/ops/index.rst @@ -1,6 +1,15 @@ Hopes: Selection ================ +Roadmap +------- + +- [x] Confidence Interval estimation using Bootstrap +- [x] Confidence Interval estimation using t-test + +Introduction +------------ + Running an Off-Policy Evaluation (OPE) experiment and then a selection of the best policies with Hopes is simple. Example with a synthetic, random, dataset. 
@@ -59,6 +68,16 @@ This should produce an output similar to: SNIPW 0.499158 0.00523288 0.490235 0.507513 ===== ======== ========== ============= ============= +Note that confidence interval (CI) calculation can be based on several methods: + +- `bootstrap` (default) +- `t-test` + +The documentation of the CI calculation can be found in +:meth:`hopes.ope.estimators.BaseEstimator.estimate_policy_value_with_confidence_interval`. + + + Classes documentation --------------------- diff --git a/hopes/ope/estimators.py b/hopes/ope/estimators.py index c6aaf06..270e39f 100644 --- a/hopes/ope/estimators.py +++ b/hopes/ope/estimators.py @@ -2,6 +2,7 @@ from abc import ABC, abstractmethod import numpy as np +import scipy from hopes.assert_utils import check_array from hopes.dev_utils import override @@ -128,13 +129,28 @@ def check_parameters(self) -> None: def estimate_policy_value_with_confidence_interval( self, - num_samples: int = 1000, + method: str = "bootstrap", significance_level: float = 0.05, + num_samples: int = 1000, ) -> dict[str, float]: - """Estimate the confidence interval of the policy value. + r"""Estimate the confidence interval of the policy value. + + The `bootstrap` method uses bootstrapping to estimate the confidence interval of the policy value. Bootstrapping + consists in resampling the data with replacement to infer the distribution of the estimated weighted rewards. + The confidence interval is then computed as the quantiles of the bootstrapped samples. + + The `t-test` method (or `Student's t-test`) uses the t-distribution of the estimated weighted rewards - assuming + that the weighted rewards are normally distributed - to estimate the confidence interval of the policy value. 
+ It follows the t-distribution formula :math:`t = \frac{\hat{\mu} - \mu}{\hat{\sigma} / \sqrt{n}}`, where + :math:`\hat{\mu}` is the mean of the weighted rewards, :math:`\mu` is the true mean of the weighted rewards, + :math:`\hat{\sigma}` is the standard deviation of the weighted rewards, and :math:`n` is the number of samples. + The confidence interval is then computed as: + + .. math:: + [\hat{\mu} - t_{\mathrm{test}}(1 - \alpha / 2, n-1) \frac{\hat{\sigma}}{\sqrt{n}}, + \hat{\mu} + t_{\mathrm{test}}(1 - \alpha / 2, n-1) \frac{\hat{\sigma}}{\sqrt{n}}] - This method uses bootstrapping to estimate the confidence interval of the policy value. The input data is - sampled from the estimated weighted rewards, using :meth:`estimate_weighted_rewards`. + The input data is sampled from the estimated weighted rewards, using :meth:`estimate_weighted_rewards`. Example: @@ -147,7 +163,7 @@ def estimate_policy_value_with_confidence_interval( rewards=rewards, ) metrics = ipw.estimate_policy_value_with_confidence_interval( - num_samples=1000, significance_level=0.05 + method="bootstrap", significance_level=0.05 ) print(metrics) @@ -162,8 +178,10 @@ def estimate_policy_value_with_confidence_interval( "std": 0.4, } - :param num_samples: the number of bootstrap samples to use. + :param method: the method to use for estimating the confidence interval. Currently, only "bootstrap" and + "t-test" are supported. :param significance_level: the significance level of the confidence interval. + :param num_samples: the number of bootstrap samples to use. Only used when `method` is "bootstrap". :return: a dictionary containing the confidence interval of the policy value. The keys are: - "lower_bound": the lower bound of the policy value, given the significance level. @@ -171,26 +189,50 @@ def estimate_policy_value_with_confidence_interval( - "mean": the mean of the policy value. - "std": the standard deviation of the policy value. 
""" + assert method in ["bootstrap", "t-test"], "The method must be 'bootstrap' or 't-test'." + assert 0 < significance_level < 1, "The significance level must be in (0, 1)." + weighted_rewards = self.estimate_weighted_rewards() assert ( weighted_rewards is not None and len(weighted_rewards) > 0 ), "The weighted rewards must not be empty." weighted_rewards = weighted_rewards.reshape(-1) - boot_samples = [ - np.mean(np.random.choice(weighted_rewards, size=weighted_rewards.shape[0])) - for _ in np.arange(num_samples) - ] - - lower_bound = np.quantile(boot_samples, significance_level / 2) - upper_bound = np.quantile(boot_samples, 1 - significance_level / 2) - - return { - "lower_bound": lower_bound, - "upper_bound": upper_bound, - "mean": np.mean(boot_samples), - "std": np.std(boot_samples), - } + + if method == "bootstrap": + boot_samples = [ + np.mean( + np.random.choice(weighted_rewards, size=weighted_rewards.shape[0], replace=True) + ) + for _ in np.arange(num_samples) + ] + + lower_bound = np.quantile(boot_samples, significance_level / 2) + upper_bound = np.quantile(boot_samples, 1 - significance_level / 2) + + return { + "lower_bound": lower_bound, + "upper_bound": upper_bound, + "mean": np.mean(boot_samples), + "std": np.std(boot_samples), + } + + elif method == "t-test": + num_samples = weighted_rewards.shape[0] + mean = np.mean(weighted_rewards) + # compute the standard deviation of the weighted rewards, using degrees of freedom = num_samples - 1 + std = np.std(weighted_rewards, ddof=1) + # compute t, with alpha = significance_level / 2 and degrees of freedom = num_samples - 1 + t = scipy.stats.t.ppf(1 - significance_level / 2, num_samples - 1) + # compute the confidence interval + ci = t * std / np.sqrt(num_samples) + + return { + "lower_bound": mean - ci, + "upper_bound": mean + ci, + "mean": mean, + "std": std, + } def short_name(self) -> str: """Return the short name of the estimator. 
diff --git a/hopes/ope/evaluation.py b/hopes/ope/evaluation.py index b4b5765..8e85ccd 100644 --- a/hopes/ope/evaluation.py +++ b/hopes/ope/evaluation.py @@ -79,7 +79,8 @@ def __init__( behavior_policy: Policy, estimators: list[BaseEstimator], fail_fast: bool = True, - significance_level: float = 0.05, + ci_method: str = "bootstrap", + ci_significance_level: float = 0.05, ): """Initialize the off-policy evaluation. @@ -89,7 +90,9 @@ def __init__( :param behavior_policy: the behavior policy used to generate the data :param estimators: a list of estimators to use to evaluate the target policy :param fail_fast: whether to stop the evaluation if one estimator fails - :param significance_level: the significance level for the confidence intervals + :param ci_method: the method to use to compute the confidence intervals. Can be + "bootstrap" or "t-test" + :param ci_significance_level: the significance level for the confidence intervals """ assert isinstance(obs, np.ndarray), "obs must be a numpy array" assert len(obs.shape) == 2, "obs must be a 2D array" @@ -101,15 +104,17 @@ def __init__( [isinstance(estimator, BaseEstimator) for estimator in estimators] ), "estimators must be a list of BaseEstimator instances" assert isinstance(fail_fast, bool), "fail_fast must be a boolean" - assert isinstance(significance_level, float), "significance_level must be a float" - assert 0 < significance_level < 1, "significance_level must be in (0, 1)" + assert ci_method in ["bootstrap", "t-test"], "ci_method must be 'bootstrap' or 't-test'" + assert isinstance(ci_significance_level, float), "significance_level must be a float" + assert 0 < ci_significance_level < 1, "significance_level must be in (0, 1)" self.obs = obs self.rewards = rewards self.behavior_policy = behavior_policy self.estimators = estimators self.fail_fast = fail_fast - self.significance_level = significance_level + self.ci_method = ci_method + self.significance_level = ci_significance_level def evaluate(self, 
target_policy: Policy) -> OffPolicyEvaluationResults: """Run the off-policy evaluation and return the estimated value of the target policy. @@ -132,7 +137,7 @@ def evaluate(self, target_policy: Policy) -> OffPolicyEvaluationResults: ) eval_results = estimator.estimate_policy_value_with_confidence_interval( - significance_level=self.significance_level + method=self.ci_method, significance_level=self.significance_level ) results[estimator.short_name()] = eval_results diff --git a/tests/test_evaluation.py b/tests/test_evaluation.py index 1b0b467..68cb9b8 100644 --- a/tests/test_evaluation.py +++ b/tests/test_evaluation.py @@ -43,7 +43,8 @@ def test_ope(self): behavior_policy=behavior_policy, estimators=estimators, fail_fast=True, - significance_level=0.1, + ci_method="bootstrap", + ci_significance_level=0.1, ) results = ope.evaluate(target_policy) self.assertEqual(0.1, results.significance_level) @@ -95,7 +96,8 @@ def test_ops(self): behavior_policy=behavior_policy, estimators=estimators, fail_fast=True, - significance_level=0.1, + ci_method="t-test", + ci_significance_level=0.1, ) results = [] @@ -104,8 +106,8 @@ def test_ops(self): top_k_results = OffPolicySelection.select_top_k(results) self.assertEqual(len(top_k_results), 1) - print(top_k_results[0]) + print("Policy selected by mean", top_k_results[0], sep="\n") top_k_results = OffPolicySelection.select_top_k(results, metric="lower_bound") self.assertEqual(len(top_k_results), 1) - print(top_k_results[0]) + print("Policy selected by lower bound", top_k_results[0], sep="\n")