Merge pull request #20 from airboxlab/tis
implement Trajectory-wise Importance Sampling
antoine-galataud authored Apr 19, 2024
2 parents 22b860b + 4f732f6 commit 62c8138
Showing 5 changed files with 149 additions and 8 deletions.
10 changes: 8 additions & 2 deletions doc/source/ope/index.rst
@@ -7,9 +7,9 @@ Roadmap
- [x] Implement Inverse Probability Weighting (IPW) estimator
- [x] Implement Self-Normalized Inverse Probability Weighting (SNIPW) estimator
- [x] Implement Direct Method (DM) estimator
-- [ ] Implement Doubly Robust (DR) estimator
-- [ ] Implement Trajectory-Wise Importance Sampling (TWIS) estimator
+- [X] Implement Trajectory-wise Importance Sampling (TIS) estimator
- [ ] Implement Per-Decision Importance Sampling (PDIS) estimator
+- [ ] Implement Doubly Robust (DR) estimator

Implemented estimators
-----------------------
@@ -23,6 +23,7 @@ Currently, the following estimators are implemented:
hopes.ope.estimators.InverseProbabilityWeighting
hopes.ope.estimators.SelfNormalizedInverseProbabilityWeighting
hopes.ope.estimators.DirectMethod
hopes.ope.estimators.TrajectoryWiseImportanceSampling

Estimators documentation
------------------------
@@ -42,6 +43,11 @@ Estimators documentation
:undoc-members:
:show-inheritance:

.. autoclass:: hopes.ope.estimators.TrajectoryWiseImportanceSampling
:members:
:undoc-members:
:show-inheritance:

Implementing a new estimator
----------------------------

88 changes: 88 additions & 0 deletions hopes/ope/estimators.py
@@ -424,3 +424,91 @@ def estimate_weighted_rewards(self) -> np.ndarray:
def estimate_policy_value(self) -> float:
"""Estimate the value of the target policy using the Direct Method estimator."""
return np.mean(self.estimate_weighted_rewards())


class TrajectoryWiseImportanceSampling(BaseEstimator):
r"""Trajectory-wise Importance Sampling (TIS) estimator.
:math:`V_{TIS} (\pi_e, D) = \frac {1}{n} \sum_{i=1}^n\sum_{t=0}^{T-1}\gamma^t w^{(i)}_{0:T-1} r_t^{(i)}`
Where:
- :math:`D` is the offline collected dataset.
- :math:`w^{(i)}_{0:T-1}` is the importance weight of trajectory :math:`i`, defined as :math:`w_{0:T-1} = \prod_{t=0}^{T-1} \frac {\pi_e(a_t|s_t)} {\pi_b(a_t|s_t)}`
- :math:`\pi_e` is the target policy and :math:`\pi_b` is the behavior policy.
- :math:`n` is the number of trajectories.
- :math:`T` is the length of the trajectory.
- :math:`\gamma` is the discount factor, applied as :math:`\gamma^t` at step :math:`t`.
- :math:`r_t^{(i)}` is the reward at time :math:`t` of trajectory :math:`i`.
TIS can suffer from high variance because the importance weights are multiplied over the whole trajectory.
References:
https://scholarworks.umass.edu/cgi/viewcontent.cgi?article=1079&context=cs_faculty_pubs
"""

def __init__(self, steps_per_episode: int, discount_factor: float = 1.0) -> None:
super().__init__()

assert steps_per_episode > 0, "The number of steps per episode must be positive."
assert 0 <= discount_factor <= 1, "The discount factor must be in [0, 1]."

self.steps_per_episode = steps_per_episode
self.discount_factor = discount_factor

@override(BaseEstimator)
def short_name(self) -> str:
return "TIS"

@override(BaseEstimator)
def check_parameters(self) -> None:
"""Check if the estimator parameters are valid."""
super().check_parameters()

assert (
self.target_policy_action_probabilities.shape[0] % self.steps_per_episode == 0
), "The number of samples must be divisible by the number of steps per episode."

@override(BaseEstimator)
def estimate_weighted_rewards(self) -> np.ndarray:
"""Estimate the weighted rewards using the Trajectory-wise Importance Sampling estimator.
:return: the weighted rewards, i.e. the estimated policy value per trajectory.
"""
self.check_parameters()

num_actions = self.target_policy_action_probabilities.shape[1]

# compute product of importance weights per trajectory
importance_weights = (
self.target_policy_action_probabilities / self.behavior_policy_action_probabilities
)
# shape: (n, T * num_actions)
importance_weights = importance_weights.reshape(-1, self.steps_per_episode * num_actions)
# shape: (n, 1)
importance_weights = np.prod(importance_weights, axis=1).reshape(-1, 1)

# rewards, shape: (n, T)
rewards = self.rewards.reshape(-1, self.steps_per_episode)

# discount factors
# make a matrix of discount factors, shape: (n, T)
num_trajectories = rewards.shape[0]
discount_factors = np.full((num_trajectories, self.steps_per_episode), self.discount_factor)
# compute the discount factor at each step as
# [gamma^0, gamma^1, ..., gamma^(T-1)] = [gamma^1, gamma^2, ..., gamma^T] / gamma
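# note: this cumprod/division trick assumes a strictly positive discount factor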
discount_factors = np.cumprod(discount_factors, axis=1) / self.discount_factor

# compute the weighted rewards per trajectory, shape: (n, 1)
weighted_rewards = np.sum(
importance_weights * rewards * discount_factors, # (n, 1) * (n, T) * (n, T)
axis=1, # sum weights over the trajectory length
).reshape(-1, 1)

return weighted_rewards

@override(BaseEstimator)
def estimate_policy_value(self) -> float:
"""Estimate the value of the target policy using the Trajectory-wise Importance Sampling
estimator."""
return np.mean(self.estimate_weighted_rewards())
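
A small standalone sketch (not part of this commit) of the TIS formula from the docstring above, evaluated with plain NumPy on two toy trajectories. The probabilities below are assumed to be those of the actions actually taken at each step, an illustrative simplification; the class itself consumes flattened (num_samples, num_actions) arrays via set_parameters and groups them into episodes of steps_per_episode steps.

# Illustration of V_TIS = mean over trajectories of sum_t gamma^t * w_{0:T-1} * r_t
import numpy as np

gamma = 0.99
# two trajectories of length T=3: pi_e(a_t|s_t), pi_b(a_t|s_t) and r_t per step
pi_e = np.array([[0.8, 0.6, 0.7], [0.5, 0.9, 0.4]])
pi_b = np.array([[0.7, 0.5, 0.6], [0.6, 0.8, 0.5]])
rewards = np.array([[1.0, 0.0, 1.0], [0.5, 1.0, 0.0]])

# w_{0:T-1}: one scalar importance weight per trajectory
traj_weights = np.prod(pi_e / pi_b, axis=1, keepdims=True)  # shape (n, 1)
discounts = gamma ** np.arange(rewards.shape[1])  # [1, gamma, gamma^2]
per_traj_value = np.sum(traj_weights * discounts * rewards, axis=1)  # shape (n,)
print(per_traj_value.mean())  # V_TIS estimate
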
10 changes: 5 additions & 5 deletions hopes/rew/rewards.py
@@ -78,11 +78,9 @@ def __init__(
and mlp are supported.
:param model_params: optional parameters for the reward model.
"""
-supported_reward_models = ["linear", "polynomial", "mlp", "random_forest"]
+supported_models = ["linear", "polynomial", "mlp", "random_forest"]

-assert (
-regression_model in supported_reward_models
-), f"Only {supported_reward_models} supported for now."
+assert regression_model in supported_models, f"Only {supported_models} supported for now."
assert obs.ndim == 2, "Observations must have shape (batch_size, obs_dim)."
assert (
obs.shape[0] == act.shape[0] == rew.shape[0]
@@ -91,13 +89,15 @@ def __init__(
self.obs = obs
self.act = act.reshape(-1, 1) if act.ndim == 1 else act
self.rew = rew.reshape(-1, 1) if rew.ndim == 1 else rew

# model configuration
self.model_params = model_params or {}
self.regression_model = regression_model
self.poly_features = None

# both linear and polynomial models are implemented using sklearn LinearRegression
# for polynomial model, we use PolynomialFeatures to generate polynomial features then fit the linear model
-if self.regression_model == "linear" or self.regression_model == "polynomial":
+if self.regression_model in ["linear", "polynomial"]:
self.model = LinearRegression()

# mlp model is implemented using torch. We use a simple feedforward neural network and MSE loss.
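
As an aside on the comment above: a minimal sketch of the "polynomial features + linear model" idea using scikit-learn directly, independent of the hopes RegressionBasedRewardModel. Concatenating observations and actions into one feature matrix is an assumption made for illustration.

# Sketch of fitting a polynomial reward model with PolynomialFeatures + LinearRegression
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

obs = np.random.rand(100, 4)  # (batch_size, obs_dim)
act = np.random.randint(3, size=(100, 1))  # (batch_size, 1)
rew = np.random.rand(100)

features = np.hstack([obs, act])  # regress rewards on observations and actions
poly = PolynomialFeatures(degree=2)
model = LinearRegression().fit(poly.fit_transform(features), rew)
print(model.predict(poly.transform(features[:5])))
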
42 changes: 42 additions & 0 deletions tests/test_estimators.py
@@ -8,6 +8,7 @@
DirectMethod,
InverseProbabilityWeighting,
SelfNormalizedInverseProbabilityWeighting,
TrajectoryWiseImportanceSampling,
)
from hopes.rew.rewards import RegressionBasedRewardModel

@@ -131,6 +132,47 @@ def test_dm(self):

self._test_ci(dm)

def test_tis(self):
traj_length = 10
num_episodes = 5
num_actions = 3

tis = TrajectoryWiseImportanceSampling(
steps_per_episode=traj_length,
discount_factor=0.99,
)

target_policy_action_probabilities = np.concatenate(
[
generate_action_probs(traj_length=traj_length, num_actions=num_actions)
for _ in range(num_episodes)
]
)
behavior_policy_action_probabilities = np.concatenate(
[
generate_action_probs(traj_length=traj_length, num_actions=num_actions)
for _ in range(num_episodes)
]
)
rewards = np.random.rand(traj_length * num_episodes)

tis.set_parameters(
target_policy_action_probabilities=target_policy_action_probabilities,
behavior_policy_action_probabilities=behavior_policy_action_probabilities,
rewards=rewards,
)

wrew = tis.estimate_weighted_rewards()
self.assertIsInstance(wrew, np.ndarray)
self.assertEqual(wrew.shape, (5, 1))

policy_value = tis.estimate_policy_value()
self.assertIsInstance(policy_value, float)
self.assertGreaterEqual(policy_value, 0.0)

# test CI
self._test_ci(tis)

def test_neg_rewards(self):
ipw = InverseProbabilityWeighting()

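
The generate_action_probs helper used above is defined elsewhere in the test module and is not part of this diff; a hypothetical implementation consistent with how it is called (rows forming valid action distributions) could be:

# Hypothetical sketch of the generate_action_probs test helper (not shown in this diff)
import numpy as np

def generate_action_probs(traj_length: int, num_actions: int) -> np.ndarray:
    probs = np.random.rand(traj_length, num_actions)
    return probs / probs.sum(axis=1, keepdims=True)  # rows sum to 1
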
7 changes: 6 additions & 1 deletion tests/test_evaluation.py
@@ -5,6 +5,7 @@
from hopes.ope.estimators import (
InverseProbabilityWeighting,
SelfNormalizedInverseProbabilityWeighting,
TrajectoryWiseImportanceSampling,
)
from hopes.ope.evaluation import OffPolicyEvaluation
from hopes.ope.selection import OffPolicySelection
@@ -57,8 +58,9 @@ def test_ope(self):

def test_ops(self):
num_actions = 3
-num_obs = 5
+num_obs = 50
num_samples = 1000
steps_per_episode = 10
obs = np.random.rand(num_samples, num_obs)
act = np.random.randint(num_actions, size=num_samples)
rew = np.random.rand(num_samples)
@@ -81,6 +83,9 @@ estimators = [
estimators = [
InverseProbabilityWeighting(),
SelfNormalizedInverseProbabilityWeighting(),
TrajectoryWiseImportanceSampling(
steps_per_episode=steps_per_episode, discount_factor=0.99
),
]

# run the off-policy evaluation
