Merge pull request #21 from airboxlab/ci-ttest
implement t-test method for CI calculation
antoine-galataud authored Apr 19, 2024
2 parents 62c8138 + 40bc586 commit b75291a
Showing 6 changed files with 102 additions and 31 deletions.
1 change: 1 addition & 0 deletions doc/source/conf.py
@@ -38,6 +38,7 @@
"pandas",
"dataclasses_json",
"tabulate",
"scipy",
]

templates_path = ["_templates"]
4 changes: 3 additions & 1 deletion doc/source/ope/index.rst
@@ -7,8 +7,10 @@ Roadmap
- [x] Implement Inverse Probability Weighting (IPW) estimator
- [x] Implement Self-Normalized Inverse Probability Weighting (SNIPW) estimator
- [x] Implement Direct Method (DM) estimator
- [X] Implement Trajectory-wise Importance Sampling (TIS) estimator
- [x] Implement Trajectory-wise Importance Sampling (TIS) estimator
- [ ] Implement Self-Normalized Trajectory-wise Importance Sampling (SNTIS) estimator
- [ ] Implement Per-Decision Importance Sampling (PDIS) estimator
- [ ] Implement Self-Normalized Per-Decision Importance Sampling (SNPDIS) estimator
- [ ] Implement Doubly Robust (DR) estimator

Implemented estimators
19 changes: 19 additions & 0 deletions doc/source/ops/index.rst
@@ -1,6 +1,15 @@
Hopes: Selection
================

Roadmap
-------

- [x] Confidence Interval estimation using Bootstrap
- [x] Confidence Interval estimation using t-test

Introduction
------------

Running an Off-Policy Evaluation (OPE) experiment and then selecting the best policies with Hopes is simple.

Example with a synthetic, random dataset.
@@ -59,6 +68,16 @@ This should produce an output similar to:
SNIPW 0.499158 0.00523288 0.490235 0.507513
===== ======== ========== ============= =============
Note that the confidence interval (CI) calculation can be based on several methods:

- `bootstrap` (default)
- `t-test`

The documentation of the CI calculation can be found in
:meth:`hopes.ope.estimators.BaseEstimator.estimate_policy_value_with_confidence_interval`.
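
For example, a minimal sketch of switching the CI method, assuming an estimator ``ipw`` already configured as in the example above (``ipw`` is a placeholder):

    metrics = ipw.estimate_policy_value_with_confidence_interval(
        method="t-test",  # or "bootstrap" (the default)
        significance_level=0.05,
    )
    print(metrics["lower_bound"], metrics["mean"], metrics["upper_bound"])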



Classes documentation
---------------------

82 changes: 62 additions & 20 deletions hopes/ope/estimators.py
@@ -2,6 +2,7 @@
from abc import ABC, abstractmethod

import numpy as np
import scipy

from hopes.assert_utils import check_array
from hopes.dev_utils import override
@@ -128,13 +129,28 @@ def check_parameters(self) -> None:

def estimate_policy_value_with_confidence_interval(
self,
num_samples: int = 1000,
method: str = "bootstrap",
significance_level: float = 0.05,
num_samples: int = 1000,
) -> dict[str, float]:
"""Estimate the confidence interval of the policy value.
r"""Estimate the confidence interval of the policy value.
The `bootstrap` method uses bootstrapping to estimate the confidence interval of the policy value. Bootstrapping
consists of resampling the data with replacement to infer the distribution of the estimated weighted rewards.
The confidence interval is then computed from the quantiles of the bootstrapped samples.
The `t-test` method (or `Student's t-test`) uses the t-distribution of the estimated weighted rewards (assuming
the weighted rewards are normally distributed) to estimate the confidence interval of the policy value.
It follows the t-distribution formula :math:`t = \frac{\hat{\mu} - \mu}{\hat{\sigma} / \sqrt{n}}`, where
:math:`\hat{\mu}` is the mean of the weighted rewards, :math:`\mu` is the true mean of the weighted rewards,
:math:`\hat{\sigma}` is the standard deviation of the weighted rewards, and :math:`n` is the number of samples.
The confidence interval is then computed as:
.. math::
[\hat{\mu} - t_{1 - \alpha/2,\, n-1} \frac{\hat{\sigma}}{\sqrt{n}},
\hat{\mu} + t_{1 - \alpha/2,\, n-1} \frac{\hat{\sigma}}{\sqrt{n}}]
where :math:`\alpha` is the significance level and :math:`t_{1 - \alpha/2,\, n-1}` is the :math:`1 - \alpha/2`
quantile of the t-distribution with :math:`n - 1` degrees of freedom.
This method uses bootstrapping to estimate the confidence interval of the policy value. The input data is
sampled from the estimated weighted rewards, using :meth:`estimate_weighted_rewards`.
The input data is sampled from the estimated weighted rewards, using :meth:`estimate_weighted_rewards`.
Example:
@@ -147,7 +163,7 @@ def estimate_policy_value_with_confidence_interval(
rewards=rewards,
)
metrics = ipw.estimate_policy_value_with_confidence_interval(
num_samples=1000, significance_level=0.05
method="bootstrap", significance_level=0.05
)
print(metrics)
@@ -162,35 +178,61 @@ def estimate_policy_value_with_confidence_interval(
"std": 0.4,
}
:param num_samples: the number of bootstrap samples to use.
:param method: the method to use for estimating the confidence interval. Currently, only "bootstrap" and
"t-test" are supported.
:param significance_level: the significance level of the confidence interval.
:param num_samples: the number of bootstrap samples to use. Only used when `method` is "bootstrap".
:return: a dictionary containing the confidence interval of the policy value. The keys are:
- "lower_bound": the lower bound of the policy value, given the significance level.
- "upper_bound": the upper bound of the policy value, given the significance level.
- "mean": the mean of the policy value.
- "std": the standard deviation of the policy value.
"""
assert method in ["bootstrap", "t-test"], "The method must be 'bootstrap' or 't-test'."
assert 0 < significance_level < 1, "The significance level must be in (0, 1)."

weighted_rewards = self.estimate_weighted_rewards()
assert (
weighted_rewards is not None and len(weighted_rewards) > 0
), "The weighted rewards must not be empty."

weighted_rewards = weighted_rewards.reshape(-1)
boot_samples = [
np.mean(np.random.choice(weighted_rewards, size=weighted_rewards.shape[0]))
for _ in np.arange(num_samples)
]

lower_bound = np.quantile(boot_samples, significance_level / 2)
upper_bound = np.quantile(boot_samples, 1 - significance_level / 2)

return {
"lower_bound": lower_bound,
"upper_bound": upper_bound,
"mean": np.mean(boot_samples),
"std": np.std(boot_samples),
}

if method == "bootstrap":
boot_samples = [
np.mean(
np.random.choice(weighted_rewards, size=weighted_rewards.shape[0], replace=True)
)
for _ in np.arange(num_samples)
]

lower_bound = np.quantile(boot_samples, significance_level / 2)
upper_bound = np.quantile(boot_samples, 1 - significance_level / 2)

return {
"lower_bound": lower_bound,
"upper_bound": upper_bound,
"mean": np.mean(boot_samples),
"std": np.std(boot_samples),
}

elif method == "t-test":
n = weighted_rewards.shape[0]  # local sample count, to avoid shadowing the num_samples parameter
mean = np.mean(weighted_rewards)
# sample standard deviation of the weighted rewards, with degrees of freedom = n - 1
std = np.std(weighted_rewards, ddof=1)
# two-sided critical value: the 1 - significance_level / 2 quantile of t with n - 1 degrees of freedom
t = scipy.stats.t.ppf(1 - significance_level / 2, n - 1)
# half-width of the confidence interval
ci = t * std / np.sqrt(n)

return {
"lower_bound": mean - ci,
"upper_bound": mean + ci,
"mean": mean,
"std": std,
}

def short_name(self) -> str:
"""Return the short name of the estimator.
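For reference, both CI branches above reduce to the following standalone sketch on synthetic weighted rewards; a minimal illustration using only numpy and scipy, independent of the estimator classes in this diff:

    import numpy as np
    import scipy.stats

    rng = np.random.default_rng(0)
    weighted_rewards = rng.normal(loc=0.5, scale=0.1, size=1000)  # synthetic stand-in
    alpha, n = 0.05, weighted_rewards.shape[0]

    # bootstrap: resample means with replacement, CI from the quantiles
    boot_means = [rng.choice(weighted_rewards, size=n, replace=True).mean() for _ in range(1000)]
    print(np.quantile(boot_means, alpha / 2), np.quantile(boot_means, 1 - alpha / 2))

    # t-test: mean +/- t(1 - alpha/2, n - 1) * std / sqrt(n)
    mean, std = weighted_rewards.mean(), weighted_rewards.std(ddof=1)
    half_width = scipy.stats.t.ppf(1 - alpha / 2, n - 1) * std / np.sqrt(n)
    print(mean - half_width, mean + half_width)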
17 changes: 11 additions & 6 deletions hopes/ope/evaluation.py
@@ -79,7 +79,8 @@ def __init__(
behavior_policy: Policy,
estimators: list[BaseEstimator],
fail_fast: bool = True,
significance_level: float = 0.05,
ci_method: str = "bootstrap",
ci_significance_level: float = 0.05,
):
"""Initialize the off-policy evaluation.
@@ -89,7 +90,9 @@
:param behavior_policy: the behavior policy used to generate the data
:param estimators: a list of estimators to use to evaluate the target policy
:param fail_fast: whether to stop the evaluation if one estimator fails
:param significance_level: the significance level for the confidence intervals
:param ci_method: the method to use to compute the confidence intervals. Can be
"bootstrap" or "t-test"
:param ci_significance_level: the significance level for the confidence intervals
"""
assert isinstance(obs, np.ndarray), "obs must be a numpy array"
assert len(obs.shape) == 2, "obs must be a 2D array"
@@ -101,15 +104,17 @@ def __init__(
[isinstance(estimator, BaseEstimator) for estimator in estimators]
), "estimators must be a list of BaseEstimator instances"
assert isinstance(fail_fast, bool), "fail_fast must be a boolean"
assert isinstance(significance_level, float), "significance_level must be a float"
assert 0 < significance_level < 1, "significance_level must be in (0, 1)"
assert ci_method in ["bootstrap", "t-test"], "ci_method must be 'bootstrap' or 't-test'"
assert isinstance(ci_significance_level, float), "ci_significance_level must be a float"
assert 0 < ci_significance_level < 1, "ci_significance_level must be in (0, 1)"

self.obs = obs
self.rewards = rewards
self.behavior_policy = behavior_policy
self.estimators = estimators
self.fail_fast = fail_fast
self.significance_level = significance_level
self.ci_method = ci_method
self.significance_level = ci_significance_level

def evaluate(self, target_policy: Policy) -> OffPolicyEvaluationResults:
"""Run the off-policy evaluation and return the estimated value of the target policy.
@@ -132,7 +137,7 @@ def evaluate(self, target_policy: Policy) -> OffPolicyEvaluationResults:
)

eval_results = estimator.estimate_policy_value_with_confidence_interval(
significance_level=self.significance_level
method=self.ci_method, significance_level=self.significance_level
)
results[estimator.short_name()] = eval_results

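A hedged wiring sketch for the renamed keyword arguments (the evaluation class name ``OffPolicyEvaluation`` and the surrounding setup are assumptions, not shown in this hunk):

    ope = OffPolicyEvaluation(
        obs=obs,  # 2D numpy array of observations
        rewards=rewards,  # rewards logged under the behavior policy
        behavior_policy=behavior_policy,
        estimators=estimators,  # list of BaseEstimator instances
        fail_fast=True,
        ci_method="t-test",  # or "bootstrap"
        ci_significance_level=0.05,
    )
    results = ope.evaluate(target_policy)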
10 changes: 6 additions & 4 deletions tests/test_evaluation.py
@@ -43,7 +43,8 @@ def test_ope(self):
behavior_policy=behavior_policy,
estimators=estimators,
fail_fast=True,
significance_level=0.1,
ci_method="bootstrap",
ci_significance_level=0.1,
)
results = ope.evaluate(target_policy)
self.assertEqual(0.1, results.significance_level)
@@ -95,7 +96,8 @@ def test_ops(self):
behavior_policy=behavior_policy,
estimators=estimators,
fail_fast=True,
significance_level=0.1,
ci_method="t-test",
ci_significance_level=0.1,
)

results = []
@@ -104,8 +106,8 @@

top_k_results = OffPolicySelection.select_top_k(results)
self.assertEqual(len(top_k_results), 1)
print(top_k_results[0])
print("Policy selected by mean", top_k_results[0], sep="\n")

top_k_results = OffPolicySelection.select_top_k(results, metric="lower_bound")
self.assertEqual(len(top_k_results), 1)
print(top_k_results[0])
print("Policy selected by lower bound", top_k_results[0], sep="\n")
