New Step API with terminated, truncated bools instead of done #2752

Merged
52 commits, merged Jul 9, 2022
Changes from 1 commit

Commits (52)
e6b0a40
New Step API with terminated, truncated bools instead of done
arjun-kg Apr 14, 2022
6618da5
Merge branch 'master' of https://github.com/openai/gym into done_term…
arjun-kg Apr 20, 2022
a0c4475
Setting return_two_dones=False as default
arjun-kg Apr 20, 2022
2aabc30
update warnings
arjun-kg Apr 20, 2022
1babe4e
pytest - ignore deprecation warnings
arjun-kg Apr 21, 2022
c9c6add
Only ignore step api deprecation warnings
arjun-kg Apr 21, 2022
c5fe53c
fix duplicate wrapping bug in vector envs
arjun-kg Apr 21, 2022
f88927d
Merge branch 'master' of https://github.com/openai/gym into done_term…
arjun-kg Apr 22, 2022
7c1e9c7
Merge branch 'master' of https://github.com/openai/gym into done_term…
arjun-kg Apr 25, 2022
6af7182
edit docstrings, comments, warnings
arjun-kg Apr 25, 2022
22c1cc7
Merge branch 'master' of https://github.com/openai/gym into done_term…
arjun-kg May 3, 2022
68ef969
step compatibility for wrappers, vectors
arjun-kg May 4, 2022
f06343b
reset tests back to old api
arjun-kg May 4, 2022
794737b
fix circular import
arjun-kg May 4, 2022
f89e5da
merge tests with master
arjun-kg May 4, 2022
8b518bb
existing code, tests work
arjun-kg May 5, 2022
9a2a9af
fix compat at registration, tests
arjun-kg May 5, 2022
29eafe5
docstrings, tests passing
arjun-kg May 5, 2022
63fc044
Merge branch 'master' of https://github.com/openai/gym into done_term…
arjun-kg May 28, 2022
97f36d3
dealing with conflicts
arjun-kg May 28, 2022
63d3d19
update wrapper class to use step compatibility
arjun-kg May 28, 2022
492c6e1
Merge branch 'master' of https://github.com/openai/gym into done_term…
arjun-kg Jun 2, 2022
9ce03cb
add warning for play
arjun-kg Jun 2, 2022
f93295f
add todo
arjun-kg Jun 2, 2022
1940494
replace 'closing' with 'final'
arjun-kg Jun 2, 2022
f12b5fb
fix pre-commit
arjun-kg Jun 2, 2022
aa5a071
remove previously missed `done` references
arjun-kg Jun 3, 2022
e135b9e
fix step compat in atari wrapper reset
arjun-kg Jun 3, 2022
2bb742a
Merge branch 'master' of https://github.com/openai/gym into done_term…
arjun-kg Jun 7, 2022
1f11077
fix tests with step returning np.bool_
arjun-kg Jun 7, 2022
e861fbc
remove warning for using new api
arjun-kg Jun 7, 2022
fe04e7c
Merge branch 'master' of https://github.com/openai/gym into done_term…
arjun-kg Jun 8, 2022
8e56f45
pre-commit fixes
arjun-kg Jun 8, 2022
4491d9a
Merge branch 'master' of https://github.com/openai/gym into done_term…
arjun-kg Jun 9, 2022
be947e3
Merge branch 'master' of https://github.com/openai/gym into done_term…
arjun-kg Jun 20, 2022
5e8f085
new API does not include 'TimeLimit.truncated' in info
arjun-kg Jun 20, 2022
cdb3516
fix checks, tests
arjun-kg Jun 20, 2022
8cc2074
vector info mask - fix wrong underscore
arjun-kg Jun 20, 2022
2f83d55
dont remove from info
arjun-kg Jun 21, 2022
57e839c
Merge branch 'master' of https://github.com/openai/gym into done_term…
arjun-kg Jun 21, 2022
b1660cf
edit definitions
arjun-kg Jun 21, 2022
ea10e7a
remove whitespaces :/
arjun-kg Jun 21, 2022
bffa257
Merge branch 'master' of https://github.com/openai/gym into done_term…
arjun-kg Jul 3, 2022
d7dff2c
update tests
arjun-kg Jul 3, 2022
b2c10a4
fix pattern
arjun-kg Jul 3, 2022
6553bed
restructure warnings
arjun-kg Jul 4, 2022
50d367e
fix incorrect warning
arjun-kg Jul 4, 2022
d71836f
fix incorrect warnings (properly)
arjun-kg Jul 4, 2022
78a507e
Merge branch 'master' of https://github.com/openai/gym into done_term…
arjun-kg Jul 4, 2022
a747625
add warning to env checker
arjun-kg Jul 5, 2022
28c7b36
Merge branch 'master' of https://github.com/openai/gym into done_term…
arjun-kg Jul 5, 2022
d65d21b
Merge branch 'master' of https://github.com/openai/gym into done_term…
arjun-kg Jul 9, 2022
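Several commits above concern how truncation is surfaced (e.g. "new API does not include 'TimeLimit.truncated' in info", "dont remove from info"). A minimal sketch of the intended division of labor, assuming the `TimeLimit` wrapper is what reports truncation once `max_episode_steps` is exhausted while the environment itself only reports termination; the `id` and `entry_point` below are placeholders:

```python
from gym.envs.registration import register

# Placeholder id and entry_point, purely for illustration.
register(
    id="MyMountainCar-v0",
    entry_point="my_package.envs:MyMountainCarEnv",
    max_episode_steps=999,  # time-limit truncation handled by the TimeLimit wrapper
)
```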
2 changes: 1 addition & 1 deletion README.md
@@ -27,7 +27,7 @@ observation, info = env.reset(seed=42, return_info=True)

for _ in range(1000):
action = env.action_space.sample()
observation, reward, done, info = env.step(action)
observation, reward, terminated, truncated, info = env.step(action)

if done:
observation, info = env.reset(return_info=True)
39 changes: 31 additions & 8 deletions gym/core.py
@@ -61,12 +61,17 @@ def np_random(self, value: RandomNumberGenerator):
self._np_random = value

@abstractmethod
def step(self, action: ActType) -> Tuple[ObsType, float, bool, dict]:
def step(
self, action: ActType
) -> Union[
Tuple[ObsType, float, bool, bool, dict], Tuple[ObsType, float, bool, dict]
Contributor (review comment):
I don't know if I like this approach to backwards compatibility. If this is the official state of (for example) 0.24.0, then you can't reliably write an algorithm that will work for all valid 0.24.0 environments. I think we should just say that an environment should have the signature of (ObsType, float, bool, bool, dict), and then provide a wrapper-like compatibility layer that can convert an old-style environment to a new-style environment.
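A minimal sketch of the wrapper-like compatibility layer suggested here, assuming old-style environments flag time-limit endings via `info["TimeLimit.truncated"]`; the class name is illustrative, not the PR's actual implementation:

```python
import gym


class OldToNewStepWrapper(gym.Wrapper):
    """Convert an old-style (done) environment to the new-style
    (terminated, truncated) step return. The name and the reliance on
    info["TimeLimit.truncated"] are illustrative assumptions only."""

    def step(self, action):
        obs, reward, done, info = self.env.step(action)
        # Old-style envs typically flag time-limit endings via info;
        # treat every other done=True as a genuine termination.
        truncated = bool(info.get("TimeLimit.truncated", False))
        terminated = done and not truncated
        return obs, reward, terminated, truncated, info
```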

]:
"""Run one timestep of the environment's dynamics. When end of
episode is reached, you are responsible for calling :meth:`reset`
to reset this environment's state.

Accepts an action and returns a tuple (observation, reward, done, info).
Accepts an action and returns either a tuple (observation, reward, terminated, truncated, info) or a tuple
(observation, reward, done, info). The latter is deprecated and will be removed in future versions.

Args:
action (object): an action provided by the agent
@@ -76,13 +81,17 @@ def step(self, action: ActType) -> Tuple[ObsType, float, bool, dict]:
Returns:
observation (object): agent's observation of the current environment. This will be an element of the environment's :attr:`observation_space`. This may, for instance, be a numpy array containing the positions and velocities of certain objects.
reward (float) : amount of reward returned after previous action
done (bool): whether the episode has ended, in which case further :meth:`step` calls will return undefined results. A done signal may be emitted for different reasons: Maybe the task underlying the environment was solved successfully, a certain timelimit was exceeded, or the physics simulation has entered an invalid state. ``info`` may contain additional information regarding the reason for a ``done`` signal.
terminated (bool): whether the episode has ended due to a termination, in which case further step() calls will return undefined results
Contributor (review comment):
I would rephrase "termination" to something like "reaching a terminal state", or otherwise to indicate that it's about the intrinsic properties of the environment

truncated (bool): whether the episode has ended due to a truncation, in which case further step() calls will return undefined results
info (dict): contains auxiliary diagnostic information (helpful for debugging, learning, and logging). This might, for instance, contain:

- metrics that describe the agent's performance or
- state variables that are hidden from observations or
- information that distinguishes truncation and termination or
- individual reward terms that are combined to produce the total reward

(deprecated)
done (bool): whether the episode has ended due to any reason, in which case further step() calls will return undefined results
"""
raise NotImplementedError
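A minimal interaction loop under the new five-item return, assuming the environment (or a compatibility wrapper around it) already emits `(obs, reward, terminated, truncated, info)` and that `reset` follows the `return_info=True` signature used in the README above:

```python
import gym

env = gym.make("CartPole-v1")
obs, info = env.reset(seed=42, return_info=True)

for _ in range(1000):
    action = env.action_space.sample()
    obs, reward, terminated, truncated, info = env.step(action)
    # Either flag ends the episode; only the reason differs.
    if terminated or truncated:
        obs, info = env.reset(return_info=True)

env.close()
```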

@@ -290,7 +299,11 @@ def metadata(self) -> dict:
def metadata(self, value):
self._metadata = value

def step(self, action: ActType) -> Tuple[ObsType, float, bool, dict]:
def step(
self, action: ActType
) -> Union[
Tuple[ObsType, float, bool, bool, dict], Tuple[ObsType, float, bool, dict]
]:
return self.env.step(action)

def reset(self, **kwargs) -> Union[ObsType, tuple[ObsType, dict]]:
@@ -325,8 +338,13 @@ def reset(self, **kwargs):
return self.observation(self.env.reset(**kwargs))

def step(self, action):
observation, reward, done, info = self.env.step(action)
return self.observation(observation), reward, done, info
step_returns = self.env.step(action)
if len(step_returns) == 5:
observation, reward, terminated, truncated, info = step_returns
return self.observation(observation), reward, terminated, truncated, info
else:
observation, reward, done, info = step_returns
return self.observation(observation), reward, done, info
arjun-kg marked this conversation as resolved.

@abstractmethod
def observation(self, observation):
@@ -338,8 +356,13 @@ def reset(self, **kwargs):
return self.env.reset(**kwargs)

def step(self, action):
observation, reward, done, info = self.env.step(action)
return observation, self.reward(reward), done, info
step_returns = self.env.step(action)
if len(step_returns) == 5:
observation, reward, terminated, truncated, info = step_returns
return observation, self.reward(reward), terminated, truncated, info
else:
observation, reward, done, info = step_returns
return observation, self.reward(reward), done, info
arjun-kg marked this conversation as resolved.

@abstractmethod
def reward(self, reward):
14 changes: 7 additions & 7 deletions gym/envs/box2d/bipedal_walker.py
@@ -581,13 +581,13 @@ def step(self, action: np.ndarray):
reward -= 0.00035 * MOTORS_TORQUE * np.clip(np.abs(a), 0, 1)
# normalized to about -50.0 using heuristic, more optimal agent should spend less

done = False
terminated = False
if self.game_over or pos[0] < 0:
reward = -100
done = True
terminated = True
if pos[0] > (TERRAIN_LENGTH - TERRAIN_GRASS) * TERRAIN_STEP:
done = True
return np.array(state, dtype=np.float32), reward, done, {}
terminated = True
return np.array(state, dtype=np.float32), reward, terminated, False, {}

def render(self, mode: str = "human"):
import pygame
@@ -757,9 +757,9 @@ def __init__(self):
SUPPORT_KNEE_ANGLE = +0.1
supporting_knee_angle = SUPPORT_KNEE_ANGLE
while True:
s, r, done, info = env.step(a)
s, r, terminated, truncated, info = env.step(a)
total_reward += r
if steps % 20 == 0 or done:
if steps % 20 == 0 or terminated or truncated:
print("\naction " + str([f"{x:+0.2f}" for x in a]))
print(f"step {steps} total_reward {total_reward:+0.2f}")
print("hull " + str([f"{x:+0.2f}" for x in s[0:4]]))
@@ -823,5 +823,5 @@ def __init__(self):
a = np.clip(0.5 * a, -1.0, 1.0)

env.render()
if done:
if terminated or truncated:
break
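The demo loop above only needs to know that the episode ended, so it tests `terminated or truncated`; training code is where the two flags diverge. A minimal sketch, assuming a one-step TD update, of why bootstrapping should key off `terminated` alone:

```python
# gamma-discounted one-step target; next_value is the critic's estimate
# for the successor state.
def td_target(reward: float, next_value: float, terminated: bool, gamma: float = 0.99) -> float:
    # Bootstrap unless a true terminal state was reached; a time-limit
    # truncation alone should not zero out the future return.
    return reward + gamma * next_value * (1.0 - float(terminated))
```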
14 changes: 7 additions & 7 deletions gym/envs/box2d/car_racing.py
@@ -415,7 +415,7 @@ def step(self, action):
self.state = self.render("state_pixels")

step_reward = 0
done = False
terminated = False
if action is not None: # First step without action, called from reset()
self.reward -= 0.1
# We actually don't want to count fuel spent, we want car to be faster.
@@ -424,13 +424,13 @@
step_reward = self.reward - self.prev_reward
self.prev_reward = self.reward
if self.tile_visited_count == len(self.track) or self.new_lap:
done = True
terminated = True
x, y = self.car.hull.position
if abs(x) > PLAYFIELD or abs(y) > PLAYFIELD:
done = True
terminated = True
step_reward = -100

return self.state, step_reward, done, {}
return self.state, step_reward, terminated, False, {}

def render(self, mode="human"):
import pygame
@@ -660,13 +660,13 @@ def register_input():
restart = False
while True:
register_input()
s, r, done, info = env.step(a)
s, r, terminated, truncated, info = env.step(a)
total_reward += r
if steps % 200 == 0 or done:
if steps % 200 == 0 or terminated or truncated:
print("\naction " + str([f"{x:+0.2f}" for x in a]))
print(f"step {steps} total_reward {total_reward:+0.2f}")
steps += 1
isopen = env.render()
if done or restart or isopen == False:
if terminated or truncated or restart or isopen == False:
break
env.close()
14 changes: 7 additions & 7 deletions gym/envs/box2d/lunar_lander.py
@@ -473,14 +473,14 @@ def step(self, action):
) # less fuel spent is better, about -30 for heuristic landing
reward -= s_power * 0.03

done = False
terminated = False
if self.game_over or abs(state[0]) >= 1.0:
done = True
terminated = True
reward = -100
if not self.lander.awake:
done = True
terminated = True
reward = +100
return np.array(state, dtype=np.float32), reward, done, {}
return np.array(state, dtype=np.float32), reward, terminated, False, {}

def render(self, mode="human"):
import pygame
@@ -654,19 +654,19 @@ def demo_heuristic_lander(env, seed=None, render=False):
s = env.reset(seed=seed)
while True:
a = heuristic(env, s)
s, r, done, info = env.step(a)
s, r, terminated, truncated, info = env.step(a)
total_reward += r

if render:
still_open = env.render()
if still_open == False:
break

if steps % 20 == 0 or done:
if steps % 20 == 0 or terminated or truncated:
print("observations:", " ".join([f"{x:+0.2f}" for x in s]))
print(f"step {steps} total_reward {total_reward:+0.2f}")
steps += 1
if done:
if terminated or truncated:
break
if render:
env.close()
14 changes: 7 additions & 7 deletions gym/envs/classic_control/acrobot.py
@@ -82,12 +82,12 @@ class AcrobotEnv(core.Env):
Each parameter in the underlying state (`theta1`, `theta2`, and the two angular velocities) is initialized
uniformly between -0.1 and 0.1. This means both links are pointing downwards with some initial stochasticity.

### Episode Termination
### Episode End

The episode terminates if one of the following occurs:
1. The free end reaches the target height, which is constructed as:
The episode ends if one of the following occurs:
1. Termination: The free end reaches the target height, which is constructed as:
`-cos(theta1) - cos(theta2 + theta1) > 1.0`
2. Episode length is greater than 500 (200 for v0)
2. Truncation: Episode length is greater than 500 (200 for v0)

### Arguments

@@ -206,9 +206,9 @@ def step(self, a):
ns[2] = bound(ns[2], -self.MAX_VEL_1, self.MAX_VEL_1)
ns[3] = bound(ns[3], -self.MAX_VEL_2, self.MAX_VEL_2)
self.state = ns
terminal = self._terminal()
reward = -1.0 if not terminal else 0.0
return (self._get_ob(), reward, terminal, {})
terminated = self._terminal()
reward = -1.0 if not terminated else 0.0
return (self._get_ob(), reward, terminated, False, {})

def _get_ob(self):
s = self.state
33 changes: 17 additions & 16 deletions gym/envs/classic_control/cartpole.py
@@ -56,12 +56,13 @@ class CartPoleEnv(gym.Env[np.ndarray, Union[int, np.ndarray]]):

All observations are assigned a uniformly random value in `(-0.05, 0.05)`

### Episode Termination
### Episode End

The episode terminates if any one of the following occurs:
1. Pole Angle is greater than ±12°
2. Cart Position is greater than ±2.4 (center of the cart reaches the edge of the display)
3. Episode length is greater than 500 (200 for v0)
The episode ends if any one of the following occurs:

1. Termination: Pole Angle is greater than ±12°
2. Termination: Cart Position is greater than ±2.4 (center of the cart reaches the edge of the display)
3. Truncation: Episode length is greater than 500 (200 for v0)

### Arguments

@@ -109,7 +110,7 @@ def __init__(self):
self.isopen = True
self.state = None

self.steps_beyond_done = None
self.steps_beyond_terminated = None

def step(self, action):
err_msg = f"{action!r} ({type(action)}) invalid"
@@ -143,31 +144,31 @@ def step(self, action):

self.state = (x, x_dot, theta, theta_dot)

done = bool(
terminated = bool(
x < -self.x_threshold
or x > self.x_threshold
or theta < -self.theta_threshold_radians
or theta > self.theta_threshold_radians
)

if not done:
if not terminated:
reward = 1.0
elif self.steps_beyond_done is None:
elif self.steps_beyond_terminated is None:
# Pole just fell!
self.steps_beyond_done = 0
self.steps_beyond_terminated = 0
reward = 1.0
else:
if self.steps_beyond_done == 0:
if self.steps_beyond_terminated == 0:
logger.warn(
"You are calling 'step()' even though this "
"environment has already returned done = True. You "
"should always call 'reset()' once you receive 'done = "
"environment has already returned terminated = True. You "
"should always call 'reset()' once you receive 'terminated = "
"True' -- any further steps are undefined behavior."
)
self.steps_beyond_done += 1
self.steps_beyond_terminated += 1
reward = 0.0

return np.array(self.state, dtype=np.float32), reward, done, {}
return np.array(self.state, dtype=np.float32), reward, terminated, False, {}

def reset(
self,
@@ -178,7 +179,7 @@
):
super().reset(seed=seed)
self.state = self.np_random.uniform(low=-0.05, high=0.05, size=(4,))
self.steps_beyond_done = None
self.steps_beyond_terminated = None
if not return_info:
return np.array(self.state, dtype=np.float32)
else:
16 changes: 9 additions & 7 deletions gym/envs/classic_control/continuous_mountain_car.py
@@ -76,11 +76,11 @@ class Continuous_MountainCarEnv(gym.Env):

The position of the car is assigned a uniform random value in `[-0.6 , -0.4]`. The starting velocity of the car is always assigned to 0.

### Episode Termination
### Episode End

The episode terminates if either of the following happens:
1. The position of the car is greater than or equal to 0.45 (the goal position on top of the right hill)
2. The length of the episode is 999.
The episode ends if either of the following happens:
1. Termination: The position of the car is greater than or equal to 0.45 (the goal position on top of the right hill)
2. Truncation: The length of the episode is 999.

### Arguments

@@ -145,15 +145,17 @@ def step(self, action: np.ndarray):
velocity = 0

# Convert a possible numpy bool to a Python bool.
done = bool(position >= self.goal_position and velocity >= self.goal_velocity)
terminated = bool(
position >= self.goal_position and velocity >= self.goal_velocity
)

reward = 0
if done:
if terminated:
reward = 100.0
reward -= math.pow(action[0], 2) * 0.1

self.state = np.array([position, velocity], dtype=np.float32)
return self.state, reward, done, {}
return self.state, reward, terminated, False, {}

def reset(
self,
14 changes: 8 additions & 6 deletions gym/envs/classic_control/mountain_car.py
@@ -72,11 +72,11 @@ class MountainCarEnv(gym.Env):

The position of the car is assigned a uniform random value in *[-0.6 , -0.4]*. The starting velocity of the car is always assigned to 0.

### Episode Termination
### Episode End

The episode terminates if either of the following happens:
1. The position of the car is greater than or equal to 0.5 (the goal position on top of the right hill)
2. The length of the episode is 200.
The episode ends if either of the following happens:
1. Termination: The position of the car is greater than or equal to 0.5 (the goal position on top of the right hill)
2. Truncation: The length of the episode is 200.


### Arguments
@@ -125,11 +125,13 @@ def step(self, action: int):
if position == self.min_position and velocity < 0:
velocity = 0

done = bool(position >= self.goal_position and velocity >= self.goal_velocity)
terminated = bool(
position >= self.goal_position and velocity >= self.goal_velocity
)
reward = -1.0

self.state = (position, velocity)
return np.array(self.state, dtype=np.float32), reward, done, {}
return np.array(self.state, dtype=np.float32), reward, terminated, False, {}

def reset(
self,