Merge pull request #20 from airboxlab/tis
implement Trajectory-wise Importance Sampling
antoine-galataud authored Apr 19, 2024
2 parents 22b860b + 4f732f6 commit 62c8138
Showing 5 changed files with 149 additions and 8 deletions.
10 changes: 8 additions & 2 deletions doc/source/ope/index.rst
@@ -7,9 +7,9 @@ Roadmap
- [x] Implement Inverse Probability Weighting (IPW) estimator
- [x] Implement Self-Normalized Inverse Probability Weighting (SNIPW) estimator
- [x] Implement Direct Method (DM) estimator
-- [ ] Implement Doubly Robust (DR) estimator
-- [ ] Implement Trajectory-Wise Importance Sampling (TWIS) estimator
+- [X] Implement Trajectory-wise Importance Sampling (TIS) estimator
- [ ] Implement Per-Decision Importance Sampling (PDIS) estimator
+- [ ] Implement Doubly Robust (DR) estimator

Implemented estimators
-----------------------
@@ -23,6 +23,7 @@ Currently, the following estimators are implemented:
hopes.ope.estimators.InverseProbabilityWeighting
hopes.ope.estimators.SelfNormalizedInverseProbabilityWeighting
hopes.ope.estimators.DirectMethod
hopes.ope.estimators.TrajectoryWiseImportanceSampling

Estimators documentation
------------------------
@@ -42,6 +43,11 @@ Estimators documentation
:undoc-members:
:show-inheritance:

.. autoclass:: hopes.ope.estimators.TrajectoryWiseImportanceSampling
:members:
:undoc-members:
:show-inheritance:

Implementing a new estimator
----------------------------

88 changes: 88 additions & 0 deletions hopes/ope/estimators.py
@@ -424,3 +424,91 @@ def estimate_weighted_rewards(self) -> np.ndarray:
def estimate_policy_value(self) -> float:
"""Estimate the value of the target policy using the Direct Method estimator."""
return np.mean(self.estimate_weighted_rewards())


class TrajectoryWiseImportanceSampling(BaseEstimator):
r"""Trajectory-wise Importance Sampling (TIS) estimator.
:math:`V_{TIS} (\pi_e, D) = \frac {1}{n} \sum_{i=1}^n\sum_{t=0}^{T-1}\gamma^t w^{(i)}_{0:T-1} r_t^{(i)}`
Where:
- :math:`D` is the offline collected dataset.
- :math:`w^{(i)}_{0:T-1}` is the importance weight of trajectory :math:`i`, defined as :math:`w_{0:T-1} = \prod_{t=0}^{T-1} \frac {\pi_e(a_t|s_t)} {\pi_b(a_t|s_t)}`
- :math:`\pi_e` is the target policy and :math:`\pi_b` is the behavior policy.
- :math:`n` is the number of trajectories.
- :math:`T` is the length of the trajectory.
- :math:`\gamma` is the discount factor, applied as :math:`\gamma^t` at step :math:`t`.
- :math:`r_t^{(i)}` is the reward at time :math:`t` of trajectory :math:`i`.
TIS can suffer from high variance because the importance weights are multiplied over the whole trajectory.
References:
https://scholarworks.umass.edu/cgi/viewcontent.cgi?article=1079&context=cs_faculty_pubs
"""

def __init__(self, steps_per_episode: int, discount_factor: float = 1.0) -> None:
super().__init__()

assert steps_per_episode > 0, "The number of steps per episode must be positive."
assert 0 <= discount_factor <= 1, "The discount factor must be in [0, 1]."

self.steps_per_episode = steps_per_episode
self.discount_factor = discount_factor

@override(BaseEstimator)
def short_name(self) -> str:
return "TIS"

@override(BaseEstimator)
def check_parameters(self) -> None:
"""Check if the estimator parameters are valid."""
super().check_parameters()

assert (
self.target_policy_action_probabilities.shape[0] % self.steps_per_episode == 0
), "The number of samples must be divisible by the number of steps per episode."

@override(BaseEstimator)
def estimate_weighted_rewards(self) -> np.ndarray:
"""Estimate the weighted rewards using the Trajectory-wise Importance Sampling estimator.
:return: the weighted rewards, i.e. the estimated policy value per trajectory.
"""
self.check_parameters()

num_actions = self.target_policy_action_probabilities.shape[1]

# compute product of importance weights per trajectory
importance_weights = (
self.target_policy_action_probabilities / self.behavior_policy_action_probabilities
)
# shape: (n, T * num_actions)
importance_weights = importance_weights.reshape(-1, self.steps_per_episode * num_actions)
# shape: (n, 1)
importance_weights = np.prod(importance_weights, axis=1).reshape(-1, 1)

# rewards, shape: (n, T)
rewards = self.rewards.reshape(-1, self.steps_per_episode)

# discount factors
# make a matrix of discount factors, shape: (n, T)
num_trajectories = rewards.shape[0]
discount_factors = np.full((num_trajectories, self.steps_per_episode), self.discount_factor)
# compute the discount factor at each step as
# [gamma^0, gamma^1, ..., gamma^(T-1)] = [gamma^1, gamma^2, ..., gamma^T] / gamma
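# note: this cumprod/division trick assumes a strictly positive discount factor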
discount_factors = np.cumprod(discount_factors, axis=1) / self.discount_factor

# compute the weighted rewards per trajectory, shape: (n, 1)
weighted_rewards = np.sum(
importance_weights * rewards * discount_factors, # (n, 1) * (n, T) * (n, T)
axis=1, # sum weights over the trajectory length
).reshape(-1, 1)

return weighted_rewards

@override(BaseEstimator)
def estimate_policy_value(self) -> float:
"""Estimate the value of the target policy using the Trajectory-wise Importance Sampling
estimator."""
return np.mean(self.estimate_weighted_rewards())
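
A small standalone sketch (not part of this commit) of the TIS formula from the docstring above, evaluated with plain NumPy on two toy trajectories. The probabilities below are assumed to be those of the actions actually taken at each step, an illustrative simplification; the class itself consumes flattened (num_samples, num_actions) arrays via set_parameters and groups them into episodes of steps_per_episode steps.

# Illustration of V_TIS = mean over trajectories of sum_t gamma^t * w_{0:T-1} * r_t
import numpy as np

gamma = 0.99
# two trajectories of length T=3: pi_e(a_t|s_t), pi_b(a_t|s_t) and r_t per step
pi_e = np.array([[0.8, 0.6, 0.7], [0.5, 0.9, 0.4]])
pi_b = np.array([[0.7, 0.5, 0.6], [0.6, 0.8, 0.5]])
rewards = np.array([[1.0, 0.0, 1.0], [0.5, 1.0, 0.0]])

# w_{0:T-1}: one scalar importance weight per trajectory
traj_weights = np.prod(pi_e / pi_b, axis=1, keepdims=True)  # shape (n, 1)
discounts = gamma ** np.arange(rewards.shape[1])  # [1, gamma, gamma^2]
per_traj_value = np.sum(traj_weights * discounts * rewards, axis=1)  # shape (n,)
print(per_traj_value.mean())  # V_TIS estimate
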
10 changes: 5 additions & 5 deletions hopes/rew/rewards.py
@@ -78,11 +78,9 @@ def __init__(
and mlp are supported.
:param model_params: optional parameters for the reward model.
"""
-supported_reward_models = ["linear", "polynomial", "mlp", "random_forest"]
+supported_models = ["linear", "polynomial", "mlp", "random_forest"]

-assert (
-regression_model in supported_reward_models
-), f"Only {supported_reward_models} supported for now."
+assert regression_model in supported_models, f"Only {supported_models} supported for now."
assert obs.ndim == 2, "Observations must have shape (batch_size, obs_dim)."
assert (
obs.shape[0] == act.shape[0] == rew.shape[0]
@@ -91,13 +89,15 @@ def __init__(
self.obs = obs
self.act = act.reshape(-1, 1) if act.ndim == 1 else act
self.rew = rew.reshape(-1, 1) if rew.ndim == 1 else rew

# model configuration
self.model_params = model_params or {}
self.regression_model = regression_model
self.poly_features = None

# both linear and polynomial models are implemented using sklearn LinearRegression
# for polynomial model, we use PolynomialFeatures to generate polynomial features then fit the linear model
-if self.regression_model == "linear" or self.regression_model == "polynomial":
+if self.regression_model in ["linear", "polynomial"]:
self.model = LinearRegression()

# mlp model is implemented using torch. We use a simple feedforward neural network and MSE loss.
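
As an aside on the comment above: a minimal sketch of the "polynomial features + linear model" idea using scikit-learn directly, independent of the hopes RegressionBasedRewardModel. Concatenating observations and actions into one feature matrix is an assumption made for illustration.

# Sketch of fitting a polynomial reward model with PolynomialFeatures + LinearRegression
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

obs = np.random.rand(100, 4)  # (batch_size, obs_dim)
act = np.random.randint(3, size=(100, 1))  # (batch_size, 1)
rew = np.random.rand(100)

features = np.hstack([obs, act])  # regress rewards on observations and actions
poly = PolynomialFeatures(degree=2)
model = LinearRegression().fit(poly.fit_transform(features), rew)
print(model.predict(poly.transform(features[:5])))
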
42 changes: 42 additions & 0 deletions tests/test_estimators.py
@@ -8,6 +8,7 @@
DirectMethod,
InverseProbabilityWeighting,
SelfNormalizedInverseProbabilityWeighting,
TrajectoryWiseImportanceSampling,
)
from hopes.rew.rewards import RegressionBasedRewardModel

@@ -131,6 +132,47 @@ def test_dm(self):

self._test_ci(dm)

def test_tis(self):
traj_length = 10
num_episodes = 5
num_actions = 3

tis = TrajectoryWiseImportanceSampling(
steps_per_episode=traj_length,
discount_factor=0.99,
)

target_policy_action_probabilities = np.concatenate(
[
generate_action_probs(traj_length=traj_length, num_actions=num_actions)
for _ in range(num_episodes)
]
)
behavior_policy_action_probabilities = np.concatenate(
[
generate_action_probs(traj_length=traj_length, num_actions=num_actions)
for _ in range(num_episodes)
]
)
rewards = np.random.rand(traj_length * num_episodes)

tis.set_parameters(
target_policy_action_probabilities=target_policy_action_probabilities,
behavior_policy_action_probabilities=behavior_policy_action_probabilities,
rewards=rewards,
)

wrew = tis.estimate_weighted_rewards()
self.assertIsInstance(wrew, np.ndarray)
self.assertEqual(wrew.shape, (5, 1))

policy_value = tis.estimate_policy_value()
self.assertIsInstance(policy_value, float)
self.assertGreaterEqual(policy_value, 0.0)

# test CI
self._test_ci(tis)

def test_neg_rewards(self):
ipw = InverseProbabilityWeighting()

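
The generate_action_probs helper used above is defined elsewhere in the test module and is not part of this diff; a hypothetical implementation consistent with how it is called (rows forming valid action distributions) could be:

# Hypothetical sketch of the generate_action_probs test helper (not shown in this diff)
import numpy as np

def generate_action_probs(traj_length: int, num_actions: int) -> np.ndarray:
    probs = np.random.rand(traj_length, num_actions)
    return probs / probs.sum(axis=1, keepdims=True)  # rows sum to 1
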
7 changes: 6 additions & 1 deletion tests/test_evaluation.py
@@ -5,6 +5,7 @@
from hopes.ope.estimators import (
InverseProbabilityWeighting,
SelfNormalizedInverseProbabilityWeighting,
TrajectoryWiseImportanceSampling,
)
from hopes.ope.evaluation import OffPolicyEvaluation
from hopes.ope.selection import OffPolicySelection
@@ -57,8 +58,9 @@ def test_ope(self):

def test_ops(self):
num_actions = 3
-num_obs = 5
+num_obs = 50
num_samples = 1000
steps_per_episode = 10
obs = np.random.rand(num_samples, num_obs)
act = np.random.randint(num_actions, size=num_samples)
rew = np.random.rand(num_samples)
@@ -81,6 +83,9 @@ estimators = [
estimators = [
InverseProbabilityWeighting(),
SelfNormalizedInverseProbabilityWeighting(),
TrajectoryWiseImportanceSampling(
steps_per_episode=steps_per_episode, discount_factor=0.99
),
]

# run the off-policy evaluation
