Merge pull request #21 from airboxlab/ci-ttest
implement t-test method for CI calculation
antoine-galataud authored Apr 19, 2024
2 parents 62c8138 + 40bc586 commit b75291a
Showing 6 changed files with 102 additions and 31 deletions.
1 change: 1 addition & 0 deletions doc/source/conf.py
@@ -38,6 +38,7 @@
"pandas",
"dataclasses_json",
"tabulate",
"scipy",
]

templates_path = ["_templates"]
4 changes: 3 additions & 1 deletion doc/source/ope/index.rst
@@ -7,8 +7,10 @@ Roadmap
- [x] Implement Inverse Probability Weighting (IPW) estimator
- [x] Implement Self-Normalized Inverse Probability Weighting (SNIPW) estimator
- [x] Implement Direct Method (DM) estimator
- [X] Implement Trajectory-wise Importance Sampling (TIS) estimator
- [x] Implement Trajectory-wise Importance Sampling (TIS) estimator
- [ ] Implement Self-Normalized Trajectory-wise Importance Sampling (SNTIS) estimator
- [ ] Implement Per-Decision Importance Sampling (PDIS) estimator
- [ ] Implement Self-Normalized Per-Decision Importance Sampling (SNPDIS) estimator
- [ ] Implement Doubly Robust (DR) estimator

Implemented estimators
19 changes: 19 additions & 0 deletions doc/source/ops/index.rst
@@ -1,6 +1,15 @@
Hopes: Selection
================

Roadmap
-------

- [x] Confidence Interval estimation using Bootstrap
- [x] Confidence Interval estimation using t-test

Introduction
------------

Running an Off-Policy Evaluation (OPE) experiment and then selecting the best policies with Hopes is simple.

Example with a synthetic, random dataset.
@@ -59,6 +68,16 @@ This should produce an output similar to:
SNIPW 0.499158 0.00523288 0.490235 0.507513
===== ======== ========== ============= =============
Note that the confidence interval (CI) calculation can be based on several methods:

- `bootstrap` (default)
- `t-test`

The documentation of the CI calculation can be found in
:meth:`hopes.ope.estimators.BaseEstimator.estimate_policy_value_with_confidence_interval`.
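
For example, a minimal sketch of switching the CI method, assuming an estimator ``ipw`` already configured as in the example above (``ipw`` is a placeholder):

    metrics = ipw.estimate_policy_value_with_confidence_interval(
        method="t-test",  # or "bootstrap" (the default)
        significance_level=0.05,
    )
    print(metrics["lower_bound"], metrics["mean"], metrics["upper_bound"])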



Classes documentation
---------------------

82 changes: 62 additions & 20 deletions hopes/ope/estimators.py
@@ -2,6 +2,7 @@
from abc import ABC, abstractmethod

import numpy as np
import scipy

from hopes.assert_utils import check_array
from hopes.dev_utils import override
@@ -128,13 +129,28 @@ def check_parameters(self) -> None:

def estimate_policy_value_with_confidence_interval(
self,
num_samples: int = 1000,
method: str = "bootstrap",
significance_level: float = 0.05,
num_samples: int = 1000,
) -> dict[str, float]:
"""Estimate the confidence interval of the policy value.
r"""Estimate the confidence interval of the policy value.
The `bootstrap` method uses bootstrapping to estimate the confidence interval of the policy value. Bootstrapping
consists of resampling the data with replacement to infer the distribution of the estimated weighted rewards.
The confidence interval is then computed from the quantiles of the bootstrapped samples.
The `t-test` method (or `Student's t-test`) uses the t-distribution of the estimated weighted rewards (assuming
the weighted rewards are normally distributed) to estimate the confidence interval of the policy value.
It follows the t-distribution formula :math:`t = \frac{\hat{\mu} - \mu}{\hat{\sigma} / \sqrt{n}}`, where
:math:`\hat{\mu}` is the mean of the weighted rewards, :math:`\mu` is the true mean of the weighted rewards,
:math:`\hat{\sigma}` is the standard deviation of the weighted rewards, and :math:`n` is the number of samples.
The confidence interval is then computed as:
.. math::
[\hat{\mu} - t_{1 - \alpha/2,\, n-1} \frac{\hat{\sigma}}{\sqrt{n}},
\hat{\mu} + t_{1 - \alpha/2,\, n-1} \frac{\hat{\sigma}}{\sqrt{n}}]
where :math:`\alpha` is the significance level and :math:`t_{1 - \alpha/2,\, n-1}` is the :math:`1 - \alpha/2`
quantile of the t-distribution with :math:`n - 1` degrees of freedom.
This method uses bootstrapping to estimate the confidence interval of the policy value. The input data is
sampled from the estimated weighted rewards, using :meth:`estimate_weighted_rewards`.
The input data is sampled from the estimated weighted rewards, using :meth:`estimate_weighted_rewards`.
Example:
@@ -147,7 +163,7 @@ def estimate_policy_value_with_confidence_interval(
rewards=rewards,
)
metrics = ipw.estimate_policy_value_with_confidence_interval(
num_samples=1000, significance_level=0.05
method="bootstrap", significance_level=0.05
)
print(metrics)
@@ -162,35 +178,61 @@ def estimate_policy_value_with_confidence_interval(
"std": 0.4,
}
:param num_samples: the number of bootstrap samples to use.
:param method: the method to use for estimating the confidence interval. Currently, only "bootstrap" and
"t-test" are supported.
:param significance_level: the significance level of the confidence interval.
:param num_samples: the number of bootstrap samples to use. Only used when `method` is "bootstrap".
:return: a dictionary containing the confidence interval of the policy value. The keys are:
- "lower_bound": the lower bound of the policy value, given the significance level.
- "upper_bound": the upper bound of the policy value, given the significance level.
- "mean": the mean of the policy value.
- "std": the standard deviation of the policy value.
"""
assert method in ["bootstrap", "t-test"], "The method must be 'bootstrap' or 't-test'."
assert 0 < significance_level < 1, "The significance level must be in (0, 1)."

weighted_rewards = self.estimate_weighted_rewards()
assert (
weighted_rewards is not None and len(weighted_rewards) > 0
), "The weighted rewards must not be empty."

weighted_rewards = weighted_rewards.reshape(-1)
boot_samples = [
np.mean(np.random.choice(weighted_rewards, size=weighted_rewards.shape[0]))
for _ in np.arange(num_samples)
]

lower_bound = np.quantile(boot_samples, significance_level / 2)
upper_bound = np.quantile(boot_samples, 1 - significance_level / 2)

return {
"lower_bound": lower_bound,
"upper_bound": upper_bound,
"mean": np.mean(boot_samples),
"std": np.std(boot_samples),
}

if method == "bootstrap":
boot_samples = [
np.mean(
np.random.choice(weighted_rewards, size=weighted_rewards.shape[0], replace=True)
)
for _ in np.arange(num_samples)
]

lower_bound = np.quantile(boot_samples, significance_level / 2)
upper_bound = np.quantile(boot_samples, 1 - significance_level / 2)

return {
"lower_bound": lower_bound,
"upper_bound": upper_bound,
"mean": np.mean(boot_samples),
"std": np.std(boot_samples),
}

elif method == "t-test":
n = weighted_rewards.shape[0]  # local sample count, to avoid shadowing the num_samples parameter
mean = np.mean(weighted_rewards)
# sample standard deviation of the weighted rewards, with degrees of freedom = n - 1
std = np.std(weighted_rewards, ddof=1)
# two-sided critical value: the 1 - significance_level / 2 quantile of t with n - 1 degrees of freedom
t = scipy.stats.t.ppf(1 - significance_level / 2, n - 1)
# half-width of the confidence interval
ci = t * std / np.sqrt(n)

return {
"lower_bound": mean - ci,
"upper_bound": mean + ci,
"mean": mean,
"std": std,
}

def short_name(self) -> str:
"""Return the short name of the estimator.
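For reference, both CI branches above reduce to the following standalone sketch on synthetic weighted rewards; a minimal illustration using only numpy and scipy, independent of the estimator classes in this diff:

    import numpy as np
    import scipy.stats

    rng = np.random.default_rng(0)
    weighted_rewards = rng.normal(loc=0.5, scale=0.1, size=1000)  # synthetic stand-in
    alpha, n = 0.05, weighted_rewards.shape[0]

    # bootstrap: resample means with replacement, CI from the quantiles
    boot_means = [rng.choice(weighted_rewards, size=n, replace=True).mean() for _ in range(1000)]
    print(np.quantile(boot_means, alpha / 2), np.quantile(boot_means, 1 - alpha / 2))

    # t-test: mean +/- t(1 - alpha/2, n - 1) * std / sqrt(n)
    mean, std = weighted_rewards.mean(), weighted_rewards.std(ddof=1)
    half_width = scipy.stats.t.ppf(1 - alpha / 2, n - 1) * std / np.sqrt(n)
    print(mean - half_width, mean + half_width)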
17 changes: 11 additions & 6 deletions hopes/ope/evaluation.py
@@ -79,7 +79,8 @@ def __init__(
behavior_policy: Policy,
estimators: list[BaseEstimator],
fail_fast: bool = True,
significance_level: float = 0.05,
ci_method: str = "bootstrap",
ci_significance_level: float = 0.05,
):
"""Initialize the off-policy evaluation.
@@ -89,7 +90,9 @@
:param behavior_policy: the behavior policy used to generate the data
:param estimators: a list of estimators to use to evaluate the target policy
:param fail_fast: whether to stop the evaluation if one estimator fails
:param significance_level: the significance level for the confidence intervals
:param ci_method: the method to use to compute the confidence intervals. Can be
"bootstrap" or "t-test"
:param ci_significance_level: the significance level for the confidence intervals
"""
assert isinstance(obs, np.ndarray), "obs must be a numpy array"
assert len(obs.shape) == 2, "obs must be a 2D array"
@@ -101,15 +104,17 @@ def __init__(
[isinstance(estimator, BaseEstimator) for estimator in estimators]
), "estimators must be a list of BaseEstimator instances"
assert isinstance(fail_fast, bool), "fail_fast must be a boolean"
assert isinstance(significance_level, float), "significance_level must be a float"
assert 0 < significance_level < 1, "significance_level must be in (0, 1)"
assert ci_method in ["bootstrap", "t-test"], "ci_method must be 'bootstrap' or 't-test'"
assert isinstance(ci_significance_level, float), "ci_significance_level must be a float"
assert 0 < ci_significance_level < 1, "ci_significance_level must be in (0, 1)"

self.obs = obs
self.rewards = rewards
self.behavior_policy = behavior_policy
self.estimators = estimators
self.fail_fast = fail_fast
self.significance_level = significance_level
self.ci_method = ci_method
self.significance_level = ci_significance_level

def evaluate(self, target_policy: Policy) -> OffPolicyEvaluationResults:
"""Run the off-policy evaluation and return the estimated value of the target policy.
@@ -132,7 +137,7 @@ def evaluate(self, target_policy: Policy) -> OffPolicyEvaluationResults:
)

eval_results = estimator.estimate_policy_value_with_confidence_interval(
significance_level=self.significance_level
method=self.ci_method, significance_level=self.significance_level
)
results[estimator.short_name()] = eval_results

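A hedged wiring sketch for the renamed keyword arguments (the evaluation class name ``OffPolicyEvaluation`` and the surrounding setup are assumptions, not shown in this hunk):

    ope = OffPolicyEvaluation(
        obs=obs,  # 2D numpy array of observations
        rewards=rewards,  # rewards logged under the behavior policy
        behavior_policy=behavior_policy,
        estimators=estimators,  # list of BaseEstimator instances
        fail_fast=True,
        ci_method="t-test",  # or "bootstrap"
        ci_significance_level=0.05,
    )
    results = ope.evaluate(target_policy)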
10 changes: 6 additions & 4 deletions tests/test_evaluation.py
@@ -43,7 +43,8 @@ def test_ope(self):
behavior_policy=behavior_policy,
estimators=estimators,
fail_fast=True,
significance_level=0.1,
ci_method="bootstrap",
ci_significance_level=0.1,
)
results = ope.evaluate(target_policy)
self.assertEqual(0.1, results.significance_level)
@@ -95,7 +96,8 @@ def test_ops(self):
behavior_policy=behavior_policy,
estimators=estimators,
fail_fast=True,
significance_level=0.1,
ci_method="t-test",
ci_significance_level=0.1,
)

results = []
@@ -104,8 +106,8 @@

top_k_results = OffPolicySelection.select_top_k(results)
self.assertEqual(len(top_k_results), 1)
print(top_k_results[0])
print("Policy selected by mean", top_k_results[0], sep="\n")

top_k_results = OffPolicySelection.select_top_k(results, metric="lower_bound")
self.assertEqual(len(top_k_results), 1)
print(top_k_results[0])
print("Policy selected by lower bound", top_k_results[0], sep="\n")
