openai · jkterry1 · Jul 9, 2022 · Apr 14, 2022 · Apr 20, 2022 · Apr 20, 2022
diff --git a/gym/core.py b/gym/core.py
@@ -66,11 +66,16 @@ def np_random(self) -> RandomNumberGenerator:
     def np_random(self, value: RandomNumberGenerator):
         self._np_random = value
 
-    def step(self, action: ActType) -> Tuple[ObsType, float, bool, dict]:
+    def step(
+        self, action: ActType
+    ) -> Union[
+        Tuple[ObsType, float, bool, bool, dict], Tuple[ObsType, float, bool, dict]
+    ]:
         """Run one timestep of the environment's dynamics.
 
         When end of episode is reached, you are responsible for calling :meth:`reset` to reset this environment's state.
-        Accepts an action and returns a tuple `(observation, reward, done, info)`.
+        Accepts an action and returns either a tuple `(observation, reward, terminated, truncated, info)`, or a tuple
+        `(observation, reward, done, info)`. The latter is deprecated and will be removed in future versions.
 
         Args:
             action (ActType): an action provided by the agent
@@ -79,14 +84,18 @@ def step(self, action: ActType) -> Tuple[ObsType, float, bool, dict]:
             observation (object): this will be an element of the environment's :attr:`observation_space`.
                 This may, for instance, be a numpy array containing the positions and velocities of certain objects.
             reward (float): The amount of reward returned as a result of taking the action.
-            done (bool): A boolean value for if the episode has ended, in which case further :meth:`step` calls will return undefined results.
-                A done signal may be emitted for different reasons: Maybe the task underlying the environment was solved successfully,
-                a certain timelimit was exceeded, or the physics simulation has entered an invalid state.
+            terminated (bool): whether the episode has ended due to reaching a terminal state intrinsic to the core environment, in which case further :meth:`step` calls will return undefined results.
+            truncated (bool): whether the episode has ended due to a truncation, i.e., a timelimit outside the scope of the problem defined in the environment.
             info (dictionary): A dictionary that may contain additional information regarding the reason for a ``done`` signal.
                 `info` contains auxiliary diagnostic information (helpful for debugging, learning, and logging).
                 This might, for instance, contain: metrics that describe the agent's performance state, variables that are
                 hidden from observations, information that distinguishes truncation and termination or individual reward terms
                 that are combined to produce the total reward
+
+            (deprecated)
+            done (bool): A boolean value for if the episode has ended, in which case further :meth:`step` calls will return undefined results.
+                A done signal may be emitted for different reasons: Maybe the task underlying the environment was solved successfully,
+                a certain timelimit was exceeded, or the physics simulation has entered an invalid state.
         """
         raise NotImplementedError
 
@@ -242,18 +251,20 @@ class Wrapper(Env[ObsType, ActType]):
         Don't forget to call ``super().__init__(env)`` if the subclass overrides :meth:`__init__`.
     """
 
-    def __init__(self, env: Env):
+    def __init__(self, env: Env, new_step_api: bool = False):
         """Wraps an environment to allow a modular transformation of the :meth:`step` and :meth:`reset` methods.
 
         Args:
             env: The environment to wrap
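+            new_step_api: If True, the wrapper's step method returns the new step API `(observation, reward, terminated, truncated, info)`; if False, it returns the old step API `(observation, reward, done, info)`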
         """
         self.env = env
 
         self._action_space: Optional[spaces.Space] = None
         self._observation_space: Optional[spaces.Space] = None
         self._reward_range: Optional[Tuple[SupportsFloat, SupportsFloat]] = None
         self._metadata: Optional[dict] = None
+        self.new_step_api = new_step_api
 
     def __getattr__(self, name):
         """Returns an attribute with ``name``, unless ``name`` starts with an underscore."""
@@ -315,9 +326,17 @@ def metadata(self) -> dict:
     def metadata(self, value):
         self._metadata = value
 
-    def step(self, action: ActType) -> Tuple[ObsType, float, bool, dict]:
+    def step(
+        self, action: ActType
+    ) -> Union[
+        Tuple[ObsType, float, bool, bool, dict], Tuple[ObsType, float, bool, dict]
+    ]:
         """Steps through the environment with action."""
-        return self.env.step(action)
+        from gym.utils.step_api_compatibility import (  # avoid circular import
+            step_api_compatibility,
+        )
+
+        return step_api_compatibility(self.env.step(action), self.new_step_api)
 
     def reset(self, **kwargs) -> Union[ObsType, Tuple[ObsType, dict]]:
         """Resets the environment with kwargs."""
@@ -387,8 +406,13 @@ def reset(self, **kwargs):
 
     def step(self, action):
         """Returns a modified observation using :meth:`self.observation` after calling :meth:`env.step`."""
-        observation, reward, done, info = self.env.step(action)
-        return self.observation(observation), reward, done, info
+        step_returns = self.env.step(action)
+        if len(step_returns) == 5:
+            observation, reward, terminated, truncated, info = step_returns
+            return self.observation(observation), reward, terminated, truncated, info
+        else:
+            observation, reward, done, info = step_returns
+            return self.observation(observation), reward, done, info
 
     def observation(self, observation):
         """Returns a modified observation."""
@@ -421,8 +445,13 @@ def reward(self, reward):
 
     def step(self, action):
         """Modifies the reward using :meth:`self.reward` after the environment :meth:`env.step`."""
-        observation, reward, done, info = self.env.step(action)
-        return observation, self.reward(reward), done, info
+        step_returns = self.env.step(action)
+        if len(step_returns) == 5:
+            observation, reward, terminated, truncated, info = step_returns
+            return observation, self.reward(reward), terminated, truncated, info
+        else:
+            observation, reward, done, info = step_returns
+            return observation, self.reward(reward), done, info
 
     def reward(self, reward):
         """Returns a modified ``reward``."""

diff --git a/gym/envs/box2d/bipedal_walker.py b/gym/envs/box2d/bipedal_walker.py
@@ -583,13 +583,13 @@ def step(self, action: np.ndarray):
             reward -= 0.00035 * MOTORS_TORQUE * np.clip(np.abs(a), 0, 1)
             # normalized to about -50.0 using heuristic, more optimal agent should spend less
 
-        done = False
+        terminated = False
         if self.game_over or pos[0] < 0:
             reward = -100
-            done = True
+            terminated = True
         if pos[0] > (TERRAIN_LENGTH - TERRAIN_GRASS) * TERRAIN_STEP:
-            done = True
-        return np.array(state, dtype=np.float32), reward, done, {}
+            terminated = True
+        return np.array(state, dtype=np.float32), reward, terminated, False, {}
 
     def render(self, mode: str = "human"):
         try:
@@ -764,9 +764,9 @@ def __init__(self):
     SUPPORT_KNEE_ANGLE = +0.1
     supporting_knee_angle = SUPPORT_KNEE_ANGLE
     while True:
-        s, r, done, info = env.step(a)
+        s, r, terminated, truncated, info = env.step(a)
         total_reward += r
-        if steps % 20 == 0 or done:
+        if steps % 20 == 0 or terminated or truncated:
             print("\naction " + str([f"{x:+0.2f}" for x in a]))
             print(f"step {steps} total_reward {total_reward:+0.2f}")
             print("hull " + str([f"{x:+0.2f}" for x in s[0:4]]))
@@ -830,5 +830,5 @@ def __init__(self):
         a = np.clip(0.5 * a, -1.0, 1.0)
 
         env.render()
-        if done:
+        if terminated or truncated:
             break
diff --git a/gym/envs/box2d/car_racing.py b/gym/envs/box2d/car_racing.py
@@ -469,7 +469,7 @@ def step(self, action: Union[np.ndarray, int]):
         self.state = self.render("state_pixels")
 
         step_reward = 0
-        done = False
+        terminated = False
         if action is not None:  # First step without action, called from reset()
             self.reward -= 0.1
             # We actually don't want to count fuel spent, we want car to be faster.
@@ -478,13 +478,13 @@ def step(self, action: Union[np.ndarray, int]):
             step_reward = self.reward - self.prev_reward
             self.prev_reward = self.reward
             if self.tile_visited_count == len(self.track) or self.new_lap:
-                done = True
+                terminated = True
             x, y = self.car.hull.position
             if abs(x) > PLAYFIELD or abs(y) > PLAYFIELD:
-                done = True
+                terminated = True
                 step_reward = -100
 
-        return self.state, step_reward, done, {}
+        return self.state, step_reward, terminated, False, {}
 
     def render(self, mode: str = "human"):
         try:
@@ -733,13 +733,13 @@ def register_input():
         restart = False
         while True:
             register_input()
-            s, r, done, info = env.step(a)
+            s, r, terminated, truncated, info = env.step(a)
             total_reward += r
-            if steps % 200 == 0 or done:
+            if steps % 200 == 0 or terminated or truncated:
                 print("\naction " + str([f"{x:+0.2f}" for x in a]))
                 print(f"step {steps} total_reward {total_reward:+0.2f}")
             steps += 1
             isopen = env.render()
-            if done or restart or isopen is False:
+            if terminated or truncated or restart or isopen is False:
                 break
     env.close()
diff --git a/gym/envs/box2d/lunar_lander.py b/gym/envs/box2d/lunar_lander.py
@@ -560,14 +560,14 @@ def step(self, action):
         )  # less fuel spent is better, about -30 for heuristic landing
         reward -= s_power * 0.03
 
-        done = False
+        terminated = False
         if self.game_over or abs(state[0]) >= 1.0:
-            done = True
+            terminated = True
             reward = -100
         if not self.lander.awake:
-            done = True
+            terminated = True
             reward = +100
-        return np.array(state, dtype=np.float32), reward, done, {}
+        return np.array(state, dtype=np.float32), reward, terminated, False, {}
 
     def render(self, mode="human"):
         try:
@@ -748,19 +748,19 @@ def demo_heuristic_lander(env, seed=None, render=False):
     s = env.reset(seed=seed)
     while True:
         a = heuristic(env, s)
-        s, r, done, info = env.step(a)
+        s, r, terminated, truncated, info = env.step(a)
         total_reward += r
 
         if render:
             still_open = env.render()
             if still_open is False:
                 break
 
-        if steps % 20 == 0 or done:
+        if steps % 20 == 0 or terminated or truncated:
             print("observations:", " ".join([f"{x:+0.2f}" for x in s]))
             print(f"step {steps} total_reward {total_reward:+0.2f}")
         steps += 1
-        if done:
+        if terminated or truncated:
             break
     if render:
         env.close()

diff --git a/gym/envs/classic_control/acrobot.py b/gym/envs/classic_control/acrobot.py
@@ -84,12 +84,12 @@ class AcrobotEnv(core.Env):
     Each parameter in the underlying state (`theta1`, `theta2`, and the two angular velocities) is initialized
     uniformly between -0.1 and 0.1. This means both links are pointing downwards with some initial stochasticity.
 
-    ### Episode Termination
+    ### Episode End
 
-    The episode terminates if one of the following occurs:
-    1. The free end reaches the target height, which is constructed as:
+    The episode ends if one of the following occurs:
+    1. Termination: The free end reaches the target height, which is constructed as:
     `-cos(theta1) - cos(theta2 + theta1) > 1.0`
-    2. Episode length is greater than 500 (200 for v0)
+    2. Truncation: Episode length is greater than 500 (200 for v0)
 
     ### Arguments
 
@@ -211,9 +211,9 @@ def step(self, a):
         ns[2] = bound(ns[2], -self.MAX_VEL_1, self.MAX_VEL_1)
         ns[3] = bound(ns[3], -self.MAX_VEL_2, self.MAX_VEL_2)
         self.state = ns
-        terminal = self._terminal()
-        reward = -1.0 if not terminal else 0.0
-        return (self._get_ob(), reward, terminal, {})
+        terminated = self._terminal()
+        reward = -1.0 if not terminated else 0.0
+        return (self._get_ob(), reward, terminated, False, {})
 
     def _get_ob(self):
         s = self.state

diff --git a/gym/envs/classic_control/cartpole.py b/gym/envs/classic_control/cartpole.py
@@ -63,12 +63,13 @@ class CartPoleEnv(gym.Env[np.ndarray, Union[int, np.ndarray]]):
 
     All observations are assigned a uniformly random value in `(-0.05, 0.05)`
 
-    ### Episode Termination
+    ### Episode End
 
-    The episode terminates if any one of the following occurs:
-    1. Pole Angle is greater than ±12°
-    2. Cart Position is greater than ±2.4 (center of the cart reaches the edge of the display)
-    3. Episode length is greater than 500 (200 for v0)
+    The episode ends if any one of the following occurs:
+
+    1. Termination: Pole Angle is greater than ±12°
+    2. Termination: Cart Position is greater than ±2.4 (center of the cart reaches the edge of the display)
+    3. Truncation: Episode length is greater than 500 (200 for v0)
 
     ### Arguments
 
@@ -116,7 +117,7 @@ def __init__(self):
         self.isopen = True
         self.state = None
 
-        self.steps_beyond_done = None
+        self.steps_beyond_terminated = None
 
     def step(self, action):
         err_msg = f"{action!r} ({type(action)}) invalid"
@@ -150,31 +151,31 @@ def step(self, action):
 
         self.state = (x, x_dot, theta, theta_dot)
 
-        done = bool(
+        terminated = bool(
             x < -self.x_threshold
             or x > self.x_threshold
             or theta < -self.theta_threshold_radians
             or theta > self.theta_threshold_radians
         )
 
-        if not done:
+        if not terminated:
             reward = 1.0
-        elif self.steps_beyond_done is None:
+        elif self.steps_beyond_terminated is None:
             # Pole just fell!
-            self.steps_beyond_done = 0
+            self.steps_beyond_terminated = 0
             reward = 1.0
         else:
-            if self.steps_beyond_done == 0:
+            if self.steps_beyond_terminated == 0:
                 logger.warn(
                     "You are calling 'step()' even though this "
-                    "environment has already returned done = True. You "
-                    "should always call 'reset()' once you receive 'done = "
+                    "environment has already returned terminated = True. You "
+                    "should always call 'reset()' once you receive 'terminated = "
                     "True' -- any further steps are undefined behavior."
                 )
-            self.steps_beyond_done += 1
+            self.steps_beyond_terminated += 1
             reward = 0.0
 
-        return np.array(self.state, dtype=np.float32), reward, done, {}
+        return np.array(self.state, dtype=np.float32), reward, terminated, False, {}
 
     def reset(
         self,
@@ -185,7 +186,7 @@ def reset(
     ):
         super().reset(seed=seed)
         self.state = self.np_random.uniform(low=-0.05, high=0.05, size=(4,))
-        self.steps_beyond_done = None
+        self.steps_beyond_terminated = None
         if not return_info:
             return np.array(self.state, dtype=np.float32)
         else:

diff --git a/gym/envs/classic_control/continuous_mountain_car.py b/gym/envs/classic_control/continuous_mountain_car.py
@@ -82,11 +82,11 @@ class Continuous_MountainCarEnv(gym.Env):
     The position of the car is assigned a uniform random value in `[-0.6 , -0.4]`.
     The starting velocity of the car is always assigned to 0.
 
-    ### Episode Termination
+    ### Episode End
 
-    The episode terminates if either of the following happens:
-    1. The position of the car is greater than or equal to 0.45 (the goal position on top of the right hill)
-    2. The length of the episode is 999.
+    The episode ends if either of the following happens:
+    1. Termination: The position of the car is greater than or equal to 0.45 (the goal position on top of the right hill)
+    2. Truncation: The length of the episode is 999.
 
     ### Arguments
 
@@ -151,15 +151,17 @@ def step(self, action: np.ndarray):
             velocity = 0
 
         # Convert a possible numpy bool to a Python bool.
-        done = bool(position >= self.goal_position and velocity >= self.goal_velocity)
+        terminated = bool(
+            position >= self.goal_position and velocity >= self.goal_velocity
+        )
 
         reward = 0
-        if done:
+        if terminated:
             reward = 100.0
         reward -= math.pow(action[0], 2) * 0.1
 
         self.state = np.array([position, velocity], dtype=np.float32)
-        return self.state, reward, done, {}
+        return self.state, reward, terminated, False, {}
 
     def reset(
         self,