diff --git a/.github/ISSUE_TEMPLATE/custom_env.yml b/.github/ISSUE_TEMPLATE/custom_env.yml index cf624c03b..f90210858 100644 --- a/.github/ISSUE_TEMPLATE/custom_env.yml +++ b/.github/ISSUE_TEMPLATE/custom_env.yml @@ -49,15 +49,16 @@ body: self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(14,)) self.action_space = spaces.Box(low=-1, high=1, shape=(6,)) - def reset(self): - return self.observation_space.sample() + def reset(self, seed=None): + return self.observation_space.sample(), {} def step(self, action): obs = self.observation_space.sample() reward = 1.0 - done = False + terminated = False + truncated = False info = {} - return obs, reward, done, info + return obs, reward, terminated, truncated, info env = CustomEnv() check_env(env) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 52f5f3895..7c238cdd4 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -55,6 +55,8 @@ jobs: - name: Type check run: | make type + # skip mypy type check for python3.7 (result is different to all other versions) + if: "!(matrix.python-version == '3.7')" - name: Test with pytest run: | make pytest diff --git a/Dockerfile b/Dockerfile index 8dfbbbf4c..421324dff 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,41 +1,25 @@ ARG PARENT_IMAGE FROM $PARENT_IMAGE ARG PYTORCH_DEPS=cpuonly -ARG PYTHON_VERSION=3.7 +ARG PYTHON_VERSION=3.8 +ARG MAMBA_DOCKERFILE_ACTIVATE=1 # (otherwise python will not be found) -RUN apt-get update && apt-get install -y --no-install-recommends \ - build-essential \ - cmake \ - git \ - curl \ - ca-certificates \ - libjpeg-dev \ - libpng-dev \ - libglib2.0-0 && \ - rm -rf /var/lib/apt/lists/* +# Install micromamba env and dependencies +RUN micromamba install -n base -y python=$PYTHON_VERSION \ + pytorch $PYTORCH_DEPS -c conda-forge -c pytorch -c nvidia && \ + micromamba clean --all --yes -# Install Anaconda and dependencies -RUN curl -o ~/miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ - chmod +x ~/miniconda.sh && \ - ~/miniconda.sh -b -p /opt/conda && \ - rm ~/miniconda.sh && \ - /opt/conda/bin/conda install -y python=$PYTHON_VERSION numpy pyyaml scipy ipython mkl mkl-include && \ - /opt/conda/bin/conda install -y pytorch $PYTORCH_DEPS -c pytorch && \ - /opt/conda/bin/conda clean -ya -ENV PATH /opt/conda/bin:$PATH - -ENV CODE_DIR /root/code +ENV CODE_DIR /home/$MAMBA_USER # Copy setup file only to install dependencies -COPY ./setup.py ${CODE_DIR}/stable-baselines3/setup.py -COPY ./stable_baselines3/version.txt ${CODE_DIR}/stable-baselines3/stable_baselines3/version.txt +COPY --chown=$MAMBA_USER:$MAMBA_USER ./setup.py ${CODE_DIR}/stable-baselines3/setup.py +COPY --chown=$MAMBA_USER:$MAMBA_USER ./stable_baselines3/version.txt ${CODE_DIR}/stable-baselines3/stable_baselines3/version.txt -RUN \ - cd ${CODE_DIR}/stable-baselines3 3&& \ +RUN cd ${CODE_DIR}/stable-baselines3 && \ pip install -e .[extra,tests,docs] && \ # Use headless version for docker pip uninstall -y opencv-python && \ pip install opencv-python-headless && \ - rm -rf $HOME/.cache/pip + pip cache purge CMD /bin/bash diff --git a/Makefile b/Makefile index 29ac5e70e..4f477d066 100644 --- a/Makefile +++ b/Makefile @@ -10,6 +10,12 @@ pytype: mypy: mypy ${LINT_PATHS} +missing-annotations: + mypy --disallow-untyped-calls --disallow-untyped-defs --ignore-missing-imports stable_baselines3 + +# missing docstrings +# pylint -d R,C,W,E -e C0116 stable_baselines3 -j 4 + type: pytype mypy lint: diff --git a/docs/conda_env.yml b/docs/conda_env.yml index 98a550820..0545eef3c 100644 --- a/docs/conda_env.yml +++ b/docs/conda_env.yml @@ -4,11 +4,11 @@ channels: - defaults dependencies: - cpuonly=1.0=0 - - pip=21.1 + - pip=22.3.1 - python=3.7 - - pytorch=1.11=py3.7_cpu_0 + - pytorch=1.11.0=py3.7_cpu_0 - pip: - - gym==0.21 + - gymnasium - cloudpickle - opencv-python-headless - pandas diff --git a/docs/guide/callbacks.rst b/docs/guide/callbacks.rst index 632743bbf..e08c00367 100644 --- a/docs/guide/callbacks.rst +++ b/docs/guide/callbacks.rst @@ -210,7 +210,7 @@ It will save the best model if ``best_model_save_path`` folder is specified and .. code-block:: python - import gym + import gymnasium as gym from stable_baselines3 import SAC from stable_baselines3.common.callbacks import EvalCallback @@ -260,7 +260,7 @@ Alternatively, you can pass directly a list of callbacks to the ``learn()`` meth .. code-block:: python - import gym + import gymnasium as gym from stable_baselines3 import SAC from stable_baselines3.common.callbacks import CallbackList, CheckpointCallback, EvalCallback @@ -290,7 +290,7 @@ It must be used with the :ref:`EvalCallback` and use the event triggered by a ne .. code-block:: python - import gym + import gymnasium as gym from stable_baselines3 import SAC from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold @@ -322,7 +322,7 @@ An :ref:`EventCallback` that will trigger its child callback every ``n_steps`` t .. code-block:: python - import gym + import gymnasium as gym from stable_baselines3 import PPO from stable_baselines3.common.callbacks import CheckpointCallback, EveryNTimesteps @@ -379,7 +379,7 @@ It must be used with the :ref:`EvalCallback` and use the event triggered after e .. code-block:: python - import gym + import gymnasium as gym from stable_baselines3 import SAC from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnNoModelImprovement diff --git a/docs/guide/checking_nan.rst b/docs/guide/checking_nan.rst index ef3762c41..8d0de36e8 100644 --- a/docs/guide/checking_nan.rst +++ b/docs/guide/checking_nan.rst @@ -100,8 +100,8 @@ It will monitor the actions, observations, and rewards, indicating what action o .. code-block:: python - import gym - from gym import spaces + import gymnasium as gym + from gymnasium import spaces import numpy as np from stable_baselines3 import PPO @@ -129,7 +129,7 @@ It will monitor the actions, observations, and rewards, indicating what action o def reset(self): return [0.0] - def render(self, mode="human", close=False): + def render(self, close=False): pass # Create environment diff --git a/docs/guide/custom_env.rst b/docs/guide/custom_env.rst index d2878c376..822c8215e 100644 --- a/docs/guide/custom_env.rst +++ b/docs/guide/custom_env.rst @@ -26,9 +26,9 @@ That is to say, your environment must implement the following methods (and inher .. code-block:: python - import gym + import gymnasium as gym import numpy as np - from gym import spaces + from gymnasium import spaces class CustomEnv(gym.Env): @@ -54,7 +54,7 @@ That is to say, your environment must implement the following methods (and inher ... return observation # reward, done, info can't be included - def render(self, mode="human"): + def render(self): ... def close(self): @@ -91,7 +91,7 @@ Optionally, you can also register the environment with gym, that will allow you .. code-block:: python - from gym.envs.registration import register + from gymnasium.envs.registration import register # Example for the CartPole environment register( # unique identifier for the env `name-version` diff --git a/docs/guide/custom_policy.rst b/docs/guide/custom_policy.rst index 80f69396a..af7d9ef0a 100644 --- a/docs/guide/custom_policy.rst +++ b/docs/guide/custom_policy.rst @@ -101,7 +101,7 @@ using ``policy_kwargs`` parameter: .. code-block:: python - import gym + import gymnasium as gym import torch as th from stable_baselines3 import PPO @@ -143,7 +143,7 @@ that derives from ``BaseFeaturesExtractor`` and then pass it to the model when t import torch as th import torch.nn as nn - from gym import spaces + from gymnasium import spaces from stable_baselines3 import PPO from stable_baselines3.common.torch_layers import BaseFeaturesExtractor @@ -208,7 +208,7 @@ downsampling and "vector" with a single linear layer. .. code-block:: python - import gym + import gymnasium as gym import torch as th from torch import nn @@ -308,7 +308,7 @@ If your task requires even more granular control over the policy/value architect from typing import Callable, Dict, List, Optional, Tuple, Type, Union - from gym import spaces + from gymnasium import spaces import torch as th from torch import nn diff --git a/docs/guide/examples.rst b/docs/guide/examples.rst index a3f1dc6f0..5935f504a 100644 --- a/docs/guide/examples.rst +++ b/docs/guide/examples.rst @@ -64,7 +64,7 @@ In the following example, we will train, save and load a DQN model on the Lunar .. code-block:: python - import gym + import gymnasium as gym from stable_baselines3 import DQN from stable_baselines3.common.evaluation import evaluate_policy @@ -115,7 +115,7 @@ Multiprocessing: Unleashing the Power of Vectorized Environments .. code-block:: python - import gym + import gymnasium as gym import numpy as np from stable_baselines3 import PPO @@ -123,18 +123,18 @@ Multiprocessing: Unleashing the Power of Vectorized Environments from stable_baselines3.common.env_util import make_vec_env from stable_baselines3.common.utils import set_random_seed - def make_env(env_id, rank, seed=0): + def make_env(env_id: str, rank: int, seed: int = 0): """ Utility function for multiprocessed env. - :param env_id: (str) the environment ID - :param num_env: (int) the number of environments you wish to have in subprocesses - :param seed: (int) the inital seed for RNG - :param rank: (int) index of the subprocess + :param env_id: the environment ID + :param num_env: the number of environments you wish to have in subprocesses + :param seed: the inital seed for RNG + :param rank: index of the subprocess """ def _init(): - env = gym.make(env_id) - env.seed(seed + rank) + env = gym.make(env_id, render_mode="human") + env.reset(seed=seed + rank) return env set_random_seed(seed) return _init @@ -143,21 +143,21 @@ Multiprocessing: Unleashing the Power of Vectorized Environments env_id = "CartPole-v1" num_cpu = 4 # Number of processes to use # Create the vectorized environment - env = SubprocVecEnv([make_env(env_id, i) for i in range(num_cpu)]) + vec_env = SubprocVecEnv([make_env(env_id, i) for i in range(num_cpu)]) # Stable Baselines provides you with make_vec_env() helper # which does exactly the previous steps for you. # You can choose between `DummyVecEnv` (usually faster) and `SubprocVecEnv` # env = make_vec_env(env_id, n_envs=num_cpu, seed=0, vec_env_cls=SubprocVecEnv) - model = PPO("MlpPolicy", env, verbose=1) + model = PPO("MlpPolicy", vec_env, verbose=1) model.learn(total_timesteps=25_000) - obs = env.reset() + obs = vec_env.reset() for _ in range(1000): action, _states = model.predict(obs) - obs, rewards, dones, info = env.step(action) - env.render() + obs, rewards, dones, info = vec_env.step(action) + vec_env.render() Multiprocessing with off-policy algorithms @@ -173,17 +173,17 @@ Multiprocessing with off-policy algorithms .. code-block:: python - import gym + import gymnasium as gym from stable_baselines3 import SAC from stable_baselines3.common.env_util import make_vec_env - env = make_vec_env("Pendulum-v0", n_envs=4, seed=0) + vec_env = make_vec_env("Pendulum-v0", n_envs=4, seed=0) # We collect 4 transitions per call to `ènv.step()` # and performs 2 gradient steps per call to `ènv.step()` # if gradient_steps=-1, then we would do 4 gradients steps per call to `ènv.step()` - model = SAC("MlpPolicy", env, train_freq=1, gradient_steps=2, verbose=1) + model = SAC("MlpPolicy", vec_env, train_freq=1, gradient_steps=2, verbose=1) model.learn(total_timesteps=10_000) @@ -229,7 +229,7 @@ If your callback returns False, training is aborted early. import os - import gym + import gymnasium as gym import numpy as np import matplotlib.pyplot as plt @@ -337,18 +337,18 @@ and multiprocessing for you. To install the Atari environments, run the command # There already exists an environment generator # that will make and wrap atari environments correctly. # Here we are also multi-worker training (n_envs=4 => 4 environments) - env = make_atari_env("PongNoFrameskip-v4", n_envs=4, seed=0) + vec_env = make_atari_env("PongNoFrameskip-v4", n_envs=4, seed=0) # Frame-stacking with 4 frames - env = VecFrameStack(env, n_stack=4) + vec_env = VecFrameStack(vec_env, n_stack=4) - model = A2C("CnnPolicy", env, verbose=1) + model = A2C("CnnPolicy", vec_env, verbose=1) model.learn(total_timesteps=25_000) - obs = env.reset() + obs = vec_env.reset() while True: - action, _states = model.predict(obs) - obs, rewards, dones, info = env.step(action) - env.render() + action, _states = model.predict(obs, deterministic=False) + obs, rewards, dones, info = vec_env.step(action) + vec_env.render("human") PyBullet: Normalizing input features @@ -372,18 +372,22 @@ will compute a running average and standard deviation of input features (it can .. code-block:: python import os - import gym + import gymnasium as gym import pybullet_envs from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize from stable_baselines3 import PPO - env = DummyVecEnv([lambda: gym.make("HalfCheetahBulletEnv-v0")]) + # Note: pybullet is not compatible yet with Gymnasium + # you might need to use `import rl_zoo3.gym_patches` + # and use gym (not Gymnasium) to instanciate the env + # Alternatively, you can use the MuJoCo equivalent "HalfCheetah-v4" + vec_env = DummyVecEnv([lambda: gym.make("HalfCheetahBulletEnv-v0")]) # Automatically normalize the input features and reward - env = VecNormalize(env, norm_obs=True, norm_reward=True, + vec_env = VecNormalize(vec_env, norm_obs=True, norm_reward=True, clip_obs=10.) - model = PPO("MlpPolicy", env) + model = PPO("MlpPolicy", vec_env) model.learn(total_timesteps=2000) # Don't forget to save the VecNormalize statistics when saving the agent @@ -393,18 +397,18 @@ will compute a running average and standard deviation of input features (it can env.save(stats_path) # To demonstrate loading - del model, env + del model, vec_env # Load the saved statistics - env = DummyVecEnv([lambda: gym.make("HalfCheetahBulletEnv-v0")]) - env = VecNormalize.load(stats_path, env) + vec_env = DummyVecEnv([lambda: gym.make("HalfCheetahBulletEnv-v0")]) + vec_env = VecNormalize.load(stats_path, vec_env) # do not update them at test time - env.training = False + vec_env.training = False # reward normalization is not needed at test time - env.norm_reward = False + vec_env.norm_reward = False # Load the agent - model = PPO.load(log_dir + "ppo_halfcheetah", env=env) + model = PPO.load(log_dir + "ppo_halfcheetah", env=vec_env) Hindsight Experience Replay (HER) @@ -430,7 +434,7 @@ The parking env is a goal-conditioned continuous control task, in which the vehi .. code-block:: python - import gym + import gymnasium as gym import highway_env import numpy as np @@ -467,19 +471,19 @@ The parking env is a goal-conditioned continuous control task, in which the vehi # HER must be loaded with the env model = SAC.load("her_sac_highway", env=env) - obs = env.reset() + obs, info = env.reset() # Evaluate the agent episode_reward = 0 for _ in range(100): action, _ = model.predict(obs, deterministic=True) - obs, reward, done, info = env.step(action) + obs, reward, terminated, truncated, info = env.step(action) env.render() episode_reward += reward - if done or info.get("is_success", False): + if terminated or truncated or info.get("is_success", False): print("Reward:", episode_reward, "Success?", info.get("is_success", False)) episode_reward = 0.0 - obs = env.reset() + obs, info = env.reset() Learning Rate Schedule @@ -621,7 +625,7 @@ A2C policy gradient updates on the model. from typing import Dict - import gym + import gymnasium as gym import numpy as np import torch as th @@ -662,7 +666,7 @@ A2C policy gradient updates on the model. # Keep top 10% n_elite = pop_size // 10 # Retrieve the environment - env = model.get_env() + vec_env = model.get_env() for iteration in range(10): # Create population of candidates and evaluate them @@ -674,7 +678,7 @@ A2C policy gradient updates on the model. # we give it (policy parameters) model.policy.load_state_dict(candidate, strict=False) # Evaluate the candidate - fitness, _ = evaluate_policy(model, env) + fitness, _ = evaluate_policy(model, vec_env) population.append((candidate, fitness)) # Take top 10% and use average over their parameters as next mean parameter top_candidates = sorted(population, key=lambda x: x[1], reverse=True)[:n_elite] @@ -738,28 +742,28 @@ Record a mp4 video (here using a random agent). .. code-block:: python - import gym + import gymnasium as gym from stable_baselines3.common.vec_env import VecVideoRecorder, DummyVecEnv env_id = "CartPole-v1" video_folder = "logs/videos/" video_length = 100 - env = DummyVecEnv([lambda: gym.make(env_id)]) + vec_env = DummyVecEnv([lambda: gym.make(env_id, render_mode="rgb_array")]) - obs = env.reset() + obs = vec_env.reset() # Record the video starting at the first step - env = VecVideoRecorder(env, video_folder, + vec_env = VecVideoRecorder(vec_env, video_folder, record_video_trigger=lambda x: x == 0, video_length=video_length, name_prefix=f"random-agent-{env_id}") - env.reset() + vec_env.reset() for _ in range(video_length + 1): - action = [env.action_space.sample()] - obs, _, _, _ = env.step(action) + action = [vec_env.action_space.sample()] + obs, _, _, _ = vec_env.step(action) # Save the video - env.close() + vec_env.close() Bonus: Make a GIF of a Trained Agent diff --git a/docs/guide/install.rst b/docs/guide/install.rst index 312b86fcb..68f8f764f 100644 --- a/docs/guide/install.rst +++ b/docs/guide/install.rst @@ -54,19 +54,6 @@ Bleeding-edge version pip install git+https://github.com/DLR-RM/stable-baselines3 -.. note:: - - If you want to use Gymnasium (or the latest Gym version 0.24+), you have to use - - .. code-block:: bash - - pip install git+https://github.com/DLR-RM/stable-baselines3@feat/gymnasium-support - pip install git+https://github.com/Stable-Baselines-Team/stable-baselines3-contrib@feat/gymnasium-support - - - See `PR #1327 `_ for more information. - - Development version ------------------- @@ -131,7 +118,7 @@ Run the nvidia-docker GPU image .. code-block:: bash - docker run -it --runtime=nvidia --rm --network host --ipc=host --name test --mount src="$(pwd)",target=/root/code/stable-baselines3,type=bind stablebaselines/stable-baselines3 bash -c 'cd /root/code/stable-baselines3/ && pytest tests/' + docker run -it --runtime=nvidia --rm --network host --ipc=host --name test --mount src="$(pwd)",target=/home/mamba/stable-baselines3,type=bind stablebaselines/stable-baselines3 bash -c 'cd /home/mamba/stable-baselines3/ && pytest tests/' Or, with the shell file: @@ -143,7 +130,7 @@ Run the docker CPU image .. code-block:: bash - docker run -it --rm --network host --ipc=host --name test --mount src="$(pwd)",target=/root/code/stable-baselines3,type=bind stablebaselines/stable-baselines3-cpu bash -c 'cd /root/code/stable-baselines3/ && pytest tests/' + docker run -it --rm --network host --ipc=host --name test --mount src="$(pwd)",target=/home/mamba/stable-baselines3,type=bind stablebaselines/stable-baselines3-cpu bash -c 'cd /home/mamba/stable-baselines3/ && pytest tests/' Or, with the shell file: @@ -165,7 +152,7 @@ Explanation of the docker command: - ``--name test`` give explicitly the name ``test`` to the container, otherwise it will be assigned a random name - ``--mount src=...`` give access of the local directory (``pwd`` - command) to the container (it will be map to ``/root/code/stable-baselines``), so + command) to the container (it will be map to ``/home/mamba/stable-baselines``), so all the logs created in the container in this folder will be kept - ``bash -c '...'`` Run command inside the docker image, here run the tests (``pytest tests/``) diff --git a/docs/guide/integrations.rst b/docs/guide/integrations.rst index 49bbdb248..14573cdec 100644 --- a/docs/guide/integrations.rst +++ b/docs/guide/integrations.rst @@ -13,7 +13,7 @@ The full documentation is available here: https://docs.wandb.ai/guides/integrati .. code-block:: python - import gym + import gymnasium as gym import wandb from wandb.integration.sb3 import WandbCallback @@ -86,7 +86,7 @@ For instance ``sb3/demo-hf-CartPole-v1``: .. code-block:: python - import gym + import gymnasium as gym from huggingface_sb3 import load_from_hub from stable_baselines3 import PPO diff --git a/docs/guide/quickstart.rst b/docs/guide/quickstart.rst index 5d1055ac9..b22ac54da 100644 --- a/docs/guide/quickstart.rst +++ b/docs/guide/quickstart.rst @@ -4,13 +4,19 @@ Getting Started =============== +.. note:: + + Stable-Baselines3 (SB3) uses :ref:`vectorized environments (VecEnv) ` internally. + Please read the associated section to learn more about its features and differences compared to a single Gym environment. + + Most of the library tries to follow a sklearn-like syntax for the Reinforcement Learning algorithms. Here is a quick example of how to train and run A2C on a CartPole environment: .. code-block:: python - import gym + import gymnasium as gym from stable_baselines3 import A2C diff --git a/docs/guide/tensorboard.rst b/docs/guide/tensorboard.rst index 2699d4a86..720c3ded2 100644 --- a/docs/guide/tensorboard.rst +++ b/docs/guide/tensorboard.rst @@ -190,7 +190,7 @@ Here is an example of how to render an episode and log the resulting video to Te from typing import Any, Dict - import gym + import gymnasium as gym import torch as th from stable_baselines3 import A2C diff --git a/docs/guide/vec_envs.rst b/docs/guide/vec_envs.rst index d84781122..ea99444d1 100644 --- a/docs/guide/vec_envs.rst +++ b/docs/guide/vec_envs.rst @@ -44,6 +44,58 @@ SubprocVecEnv ✔️ ✔️ ✔️ ✔️ ✔️ For more information, see Python's `multiprocessing guidelines `_. +VecEnv API vs Gym API +--------------------- + +For consistency across Stable-Baselines3 (SB3) versions and because of its special requirements and features, +SB3 VecEnv API is not the same as Gym API. +SB3 VecEnv API is actually close to Gym 0.21 API but differs to Gym 0.26+ API: + +- the ``reset()`` method only returns the observation (``obs = vec_env.reset()``) and not a tuple, the info at reset are stored in ``vec_env.reset_infos``. + +- only the initial call to ``vec_env.reset()`` is required, environments are reset automatically afterward (and ``reset_infos`` is updated automatically). + +- the ``vec_env.step(actions)`` method expects an array as input + (with a batch size corresponding to the number of environments) and returns a 4-tuple (and not a 5-tuple): ``obs, rewards, dones, infos`` instead of ``obs, reward, terminated, truncated, info`` + where ``dones = terminated or truncated`` (for each env). + ``obs, rewards, dones`` are numpy arrays with shape ``(n_envs, shape_for_single_env)`` (so with a batch dimension). + Additional information is passed via the ``infos`` value which is a list of dictionaries. + +- at the end of an episode, ``infos[env_idx]["TimeLimit.truncated"] = truncated and not terminated`` + tells the user if an episode was truncated or not: + you should bootstrap if ``infos[env_idx]["TimeLimit.truncated"] is True`` (episode over due to a timeout/truncation) + or ``dones[env_idx] is False`` (episode not finished). + Note: compared to Gym 0.26+ ``infos[env_idx]["TimeLimit.truncated"]`` and ``terminated`` `are mutually exclusive `_. + The conversion from SB3 to Gym API is + + .. code-block:: python + + # done is True at the end of an episode + # dones[env_idx] = terminated[env_idx] or truncated[env_idx] + # In SB3, truncated and terminated are mutually exclusive + # infos[env_idx]["TimeLimit.truncated"] = truncated and not terminated + # terminated[env_idx] tells you whether you should bootstrap or not: + # when the episode has not ended or when the termination was a timeout/truncation + terminated[env_idx] = dones[env_idx] and not infos[env_idx]["TimeLimit.truncated"] + should_bootstrap[env_idx] = not terminated[env_idx] + + +- at the end of an episode, because the environment resets automatically, + we provide ``infos[env_idx]["terminal_observation"]`` which contains the last observation + of an episode (and can be used when bootstrapping, see note in the previous section) + +- to overcome the current Gymnasium limitation (only one render mode allowed per env instance, see `issue #100 `_), + we recommend using ``render_mode="rgb_array"`` since we can both have the image as a numpy array and display it with OpenCV. + if no mode is passed or ``mode="rgb_array"`` is passed when calling ``vec_env.render`` then we use the default mode, otherwise, we use the OpenCV display. + Note that if ``render_mode != "rgb_array"``, you can only call ``vec_env.render()`` (without argument or with ``mode=env.render_mode``). + +- the ``reset()`` method doesn't take any parameter. If you want to seed the pseudo-random generator, + you should call ``vec_env.seed(seed=seed)`` and ``obs = vec_env.reset()`` afterward. + +- methods and attributes of the underlying Gym envs can be accessed, called and set using ``vec_env.get_attr("attribute_name")``, + ``vec_env.env_method("method_name", args1, args2, kwargs1=kwargs1)`` and ``vec_env.set_attr("attribute_name", new_value)``. + + Vectorized Environments Wrappers -------------------------------- diff --git a/docs/misc/changelog.rst b/docs/misc/changelog.rst index 4ae5e471d..5f2d0b760 100644 --- a/docs/misc/changelog.rst +++ b/docs/misc/changelog.rst @@ -3,16 +3,29 @@ Changelog ========== -Release 1.8.1a0 (WIP) +Release 2.0.0a4 (WIP) -------------------------- +**Gymnasium support** + +.. warning:: + + Stable-Baselines3 (SB3) v2.0 will be the last one supporting python 3.7 (end of life in June 2023). + We highly recommended you to upgrade to Python >= 3.8. + + Breaking Changes: ^^^^^^^^^^^^^^^^^ +- Switched to Gymnasium as primary backend, Gym 0.21 and 0.26 are still supported via the ``shimmy`` package (@carlosluis, @arjun-kg, @tlpss) +- The deprecated ``online_sampling`` argument of ``HerReplayBuffer`` was removed +- Removed deprecated ``stack_observation_space`` method of ``StackedObservations`` - Renamed environment output observations in ``evaluate_policy`` to prevent shadowing the input observations during callbacks (@npit) +- Upgraded wrappers and custom environment to Gymnasium New Features: ^^^^^^^^^^^^^ + `SB3-Contrib`_ ^^^^^^^^^^^^^^ @@ -28,9 +41,22 @@ Deprecations: Others: ^^^^^^^ +- Fixed ``stable_baselines3/a2c/*.py`` type hints +- Fixed ``stable_baselines3/ppo/*.py`` type hints +- Fixed ``stable_baselines3/sac/*.py`` type hints +- Fixed ``stable_baselines3/td3/*.py`` type hints +- Fixed ``stable_baselines3/common/base_class.py`` type hints +- Upgraded docker images to use mamba/micromamba and CUDA 11.7 +- Updated env checker to reflect what subset of Gymnasium is supported and improve GoalEnv checks +- Improve type annotation of wrappers +- Tests envs are now checked too +- Added render test for ``VecEnv`` Documentation: ^^^^^^^^^^^^^^ +- Added documentation about ``VecEnv`` API vs Gym API +- Upgraded tutorials to Gymnasium API +- Make it more explicit when using ``VecEnv`` vs Gym env Release 1.8.0 (2023-04-07) @@ -328,6 +354,7 @@ Breaking Changes: New Features: ^^^^^^^^^^^^^ + `SB3-Contrib`_ ^^^^^^^^^^^^^^ - Added Recurrent PPO (PPO LSTM). See https://github.com/Stable-Baselines-Team/stable-baselines3-contrib/pull/53 @@ -373,7 +400,7 @@ Release 1.5.0 (2022-03-25) Breaking Changes: ^^^^^^^^^^^^^^^^^ -- Switched minimum Gym version to 0.21.0. +- Switched minimum Gym version to 0.21.0 New Features: ^^^^^^^^^^^^^ @@ -1298,6 +1325,7 @@ And all the contributors: @eleurent @ac-93 @cove9988 @theDebugger811 @hsuehch @Demetrio92 @thomasgubler @IperGiove @ScheiklP @simoninithomas @armandpl @manuel-delverme @Gautam-J @gianlucadecola @buoyancy99 @caburu @xy9485 @Gregwar @ycheng517 @quantitative-technologies @bcollazo @git-thor @TibiGG @cool-RR @MWeltevrede -@Melanol @qgallouedec @francescoluciano @jlp-ue @burakdmb @timothe-chaumont @honglu2875 @yuanmingqi +@carlosluis @arjun-kg @tlpss +@Melanol @qgallouedec @francescoluciano @jlp-ue @burakdmb @timothe-chaumont @honglu2875 @anand-bala @hughperkins @sidney-tio @AlexPasqua @dominicgkerr @Akhilez @Rocamonde @tobirohrer @ZikangXiong -@DavyMorgan @luizapozzobon @Bonifatius94 @theSquaredError @harveybellini @DavyMorgan @FieteO @jonasreiher @npit @WeberSamuel \ No newline at end of file +@DavyMorgan @luizapozzobon @Bonifatius94 @theSquaredError @harveybellini @DavyMorgan @FieteO @jonasreiher @npit @WeberSamuel diff --git a/docs/modules/a2c.rst b/docs/modules/a2c.rst index 670da617c..84a94eaab 100644 --- a/docs/modules/a2c.rst +++ b/docs/modules/a2c.rst @@ -53,7 +53,7 @@ Train a A2C agent on ``CartPole-v1`` using 4 environments. .. code-block:: python - import gym + import gymnasium as gym from stable_baselines3 import A2C from stable_baselines3.common.env_util import make_vec_env diff --git a/docs/modules/ddpg.rst b/docs/modules/ddpg.rst index c484a1c93..4ac28ccb3 100644 --- a/docs/modules/ddpg.rst +++ b/docs/modules/ddpg.rst @@ -61,7 +61,7 @@ This example is only to demonstrate the use of the library and its functions, an .. code-block:: python - import gym + import gymnasium as gym import numpy as np from stable_baselines3 import DDPG diff --git a/docs/modules/dqn.rst b/docs/modules/dqn.rst index 8648606cc..0569aa528 100644 --- a/docs/modules/dqn.rst +++ b/docs/modules/dqn.rst @@ -56,7 +56,7 @@ This example is only to demonstrate the use of the library and its functions, an .. code-block:: python - import gym + import gymnasium as gym from stable_baselines3 import DQN diff --git a/docs/modules/ppo.rst b/docs/modules/ppo.rst index d0c425fb5..a822cb436 100644 --- a/docs/modules/ppo.rst +++ b/docs/modules/ppo.rst @@ -65,7 +65,7 @@ Train a PPO agent on ``CartPole-v1`` using 4 environments. .. code-block:: python - import gym + import gymnasium as gym from stable_baselines3 import PPO from stable_baselines3.common.env_util import make_vec_env diff --git a/docs/modules/sac.rst b/docs/modules/sac.rst index e7f9057d5..0e9bb3f64 100644 --- a/docs/modules/sac.rst +++ b/docs/modules/sac.rst @@ -68,7 +68,7 @@ This example is only to demonstrate the use of the library and its functions, an .. code-block:: python - import gym + import gymnasium as gym import numpy as np from stable_baselines3 import SAC diff --git a/docs/modules/td3.rst b/docs/modules/td3.rst index d039ae71c..7c17e644d 100644 --- a/docs/modules/td3.rst +++ b/docs/modules/td3.rst @@ -61,7 +61,7 @@ This example is only to demonstrate the use of the library and its functions, an .. code-block:: python - import gym + import gymnasium as gym import numpy as np from stable_baselines3 import TD3 diff --git a/pyproject.toml b/pyproject.toml index 2f98aa4f4..b44edf529 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,9 +35,7 @@ ignore_missing_imports = true follow_imports = "silent" show_error_codes = true exclude = """(?x)( - stable_baselines3/a2c/a2c.py$ - | stable_baselines3/common/base_class.py$ - | stable_baselines3/common/buffers.py$ + stable_baselines3/common/buffers.py$ | stable_baselines3/common/callbacks.py$ | stable_baselines3/common/distributions.py$ | stable_baselines3/common/envs/bit_flipping_env.py$ @@ -45,7 +43,6 @@ exclude = """(?x)( | stable_baselines3/common/envs/multi_input_envs.py$ | stable_baselines3/common/logger.py$ | stable_baselines3/common/off_policy_algorithm.py$ - | stable_baselines3/common/on_policy_algorithm.py$ | stable_baselines3/common/policies.py$ | stable_baselines3/common/save_util.py$ | stable_baselines3/common/sb2_compat/rmsprop_tf_like.py$ @@ -62,11 +59,6 @@ exclude = """(?x)( | stable_baselines3/common/vec_env/vec_transpose.py$ | stable_baselines3/common/vec_env/vec_video_recorder.py$ | stable_baselines3/her/her_replay_buffer.py$ - | stable_baselines3/ppo/ppo.py$ - | stable_baselines3/sac/policies.py$ - | stable_baselines3/sac/sac.py$ - | stable_baselines3/td3/policies.py$ - | stable_baselines3/td3/td3.py$ | tests/test_logger.py$ | tests/test_train_eval_mode.py$ )""" @@ -80,12 +72,8 @@ env = [ filterwarnings = [ # Tensorboard warnings "ignore::DeprecationWarning:tensorboard", - # Gym warnings - "ignore:Parameters to load are deprecated.:DeprecationWarning", - "ignore:the imp module is deprecated in favour of importlib:PendingDeprecationWarning", - "ignore::UserWarning:gym", - "ignore:SelectableGroups dict interface is deprecated.:DeprecationWarning", - "ignore:`np.bool` is a deprecated alias for the builtin `bool`:DeprecationWarning", + # Gymnasium warnings + "ignore::UserWarning:gymnasium", ] markers = [ "expensive: marks tests as expensive (deselect with '-m \"not expensive\"')" diff --git a/scripts/build_docker.sh b/scripts/build_docker.sh index 13ac86b17..c1a4a5608 100755 --- a/scripts/build_docker.sh +++ b/scripts/build_docker.sh @@ -1,14 +1,14 @@ #!/bin/bash -CPU_PARENT=ubuntu:18.04 -GPU_PARENT=nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04 +CPU_PARENT=mambaorg/micromamba:1.4-kinetic +GPU_PARENT=mambaorg/micromamba:1.4.1-focal-cuda-11.7.1 TAG=stablebaselines/stable-baselines3 VERSION=$(cat ./stable_baselines3/version.txt) if [[ ${USE_GPU} == "True" ]]; then PARENT=${GPU_PARENT} - PYTORCH_DEPS="cudatoolkit=10.1" + PYTORCH_DEPS="pytorch-cuda=11.7" else PARENT=${CPU_PARENT} PYTORCH_DEPS="cpuonly" diff --git a/scripts/run_docker_cpu.sh b/scripts/run_docker_cpu.sh index 6dfafd2b9..db6c6493b 100755 --- a/scripts/run_docker_cpu.sh +++ b/scripts/run_docker_cpu.sh @@ -7,5 +7,5 @@ echo "Executing in the docker (cpu image):" echo $cmd_line docker run -it --rm --network host --ipc=host \ - --mount src=$(pwd),target=/root/code/stable-baselines3,type=bind stablebaselines/stable-baselines3-cpu:latest \ - bash -c "cd /root/code/stable-baselines3/ && $cmd_line" + --mount src=$(pwd),target=/home/mamba/stable-baselines3,type=bind stablebaselines/stable-baselines3-cpu:latest \ + bash -c "cd /home/mamba/stable-baselines3/ && $cmd_line" diff --git a/scripts/run_docker_gpu.sh b/scripts/run_docker_gpu.sh index 19e16067a..fa8aae9c4 100755 --- a/scripts/run_docker_gpu.sh +++ b/scripts/run_docker_gpu.sh @@ -15,5 +15,5 @@ else fi docker run -it ${NVIDIA_ARG} --rm --network host --ipc=host \ - --mount src=$(pwd),target=/root/code/stable-baselines3,type=bind stablebaselines/stable-baselines3:latest \ - bash -c "cd /root/code/stable-baselines3/ && $cmd_line" + --mount src=$(pwd),target=/home/mamba/stable-baselines3,type=bind stablebaselines/stable-baselines3:latest \ + bash -c "cd /home/mamba/stable-baselines3/ && $cmd_line" diff --git a/setup.py b/setup.py index dd7b69637..a7a3fcc8b 100644 --- a/setup.py +++ b/setup.py @@ -39,11 +39,11 @@ Here is a quick example of how to train and run PPO on a cartpole environment: ```python -import gym +import gymnasium from stable_baselines3 import PPO -env = gym.make("CartPole-v1") +env = gymnasium.make("CartPole-v1") model = PPO("MlpPolicy", env, verbose=1) model.learn(total_timesteps=10_000) @@ -60,7 +60,7 @@ ``` -Or just train a model with a one liner if [the environment is registered in Gym](https://www.gymlibrary.ml/content/environment_creation/) and if [the policy is registered](https://stable-baselines3.readthedocs.io/en/master/guide/custom_policy.html): +Or just train a model with a one liner if [the environment is registered in Gymnasium](https://gymnasium.farama.org/tutorials/gymnasium_basics/environment_creation/) and if [the policy is registered](https://stable-baselines3.readthedocs.io/en/master/guide/custom_policy.html): ```python from stable_baselines3 import PPO @@ -76,6 +76,9 @@ extra_no_roms = [ # For render "opencv-python", + 'pygame; python_version >= "3.8.0"', + # See https://github.com/pygame/pygame/issues/3572 + 'pygame>=2.0,<2.1.3; python_version < "3.8.0"', # Tensorboard support "tensorboard>=2.9.1", # Checking memory taken by replay buffer @@ -84,7 +87,7 @@ "tqdm", "rich", # For atari games, - "ale-py==0.7.4", + "shimmy[atari]~=0.2.1", "pillow", ] @@ -99,7 +102,7 @@ packages=[package for package in find_packages() if package.startswith("stable_baselines3")], package_data={"stable_baselines3": ["py.typed", "version.txt"]}, install_requires=[ - "gym==0.21", # Fixed version due to breaking changes in 0.22 + "gymnasium==0.28.1", "numpy", "torch>=1.11", 'typing_extensions>=4.0,<5; python_version < "3.8.0"', @@ -109,8 +112,6 @@ "pandas", # Plotting learning curves "matplotlib", - # gym not compatible with importlib-metadata>5.0 - "importlib-metadata~=4.13", ], extras_require={ "tests": [ @@ -128,8 +129,6 @@ "isort>=5.0", # Reformat "black", - # For toy text Gym envs - "scipy>=1.4.1", ], "docs": [ "sphinx", diff --git a/stable_baselines3/__init__.py b/stable_baselines3/__init__.py index 680e25453..0775a8ec5 100644 --- a/stable_baselines3/__init__.py +++ b/stable_baselines3/__init__.py @@ -1,7 +1,5 @@ import os -import numpy as np - from stable_baselines3.a2c import A2C from stable_baselines3.common.utils import get_system_info from stable_baselines3.ddpg import DDPG @@ -11,10 +9,6 @@ from stable_baselines3.sac import SAC from stable_baselines3.td3 import TD3 -# Small monkey patch so gym 0.21 is compatible with numpy >= 1.24 -# TODO: remove when upgrading to gym 0.26 -np.bool = bool # type: ignore[attr-defined] - # Read version from file version_file = os.path.join(os.path.dirname(__file__), "version.txt") with open(version_file) as file_handler: diff --git a/stable_baselines3/a2c/a2c.py b/stable_baselines3/a2c/a2c.py index 9ecde5b85..996f17c19 100644 --- a/stable_baselines3/a2c/a2c.py +++ b/stable_baselines3/a2c/a2c.py @@ -1,7 +1,7 @@ from typing import Any, Dict, Optional, Type, TypeVar, Union import torch as th -from gym import spaces +from gymnasium import spaces from torch.nn import functional as F from stable_baselines3.common.on_policy_algorithm import OnPolicyAlgorithm diff --git a/stable_baselines3/common/atari_wrappers.py b/stable_baselines3/common/atari_wrappers.py index ad29a3142..ea78c5b7c 100644 --- a/stable_baselines3/common/atari_wrappers.py +++ b/stable_baselines3/common/atari_wrappers.py @@ -1,6 +1,10 @@ -import gym +from typing import Dict, SupportsFloat + +import gymnasium as gym import numpy as np -from gym import spaces +from gymnasium import spaces + +from stable_baselines3.common.type_aliases import AtariResetReturn, AtariStepReturn try: import cv2 # pytype:disable=import-error @@ -9,10 +13,8 @@ except ImportError: cv2 = None -from stable_baselines3.common.type_aliases import GymObs, GymStepReturn - -class StickyActionEnv(gym.Wrapper): +class StickyActionEnv(gym.Wrapper[np.ndarray, int, np.ndarray, int]): """ Sticky action. @@ -26,19 +28,19 @@ class StickyActionEnv(gym.Wrapper): def __init__(self, env: gym.Env, action_repeat_probability: float) -> None: super().__init__(env) self.action_repeat_probability = action_repeat_probability - assert env.unwrapped.get_action_meanings()[0] == "NOOP" + assert env.unwrapped.get_action_meanings()[0] == "NOOP" # type: ignore[attr-defined] - def reset(self, **kwargs) -> GymObs: + def reset(self, **kwargs) -> AtariResetReturn: self._sticky_action = 0 # NOOP return self.env.reset(**kwargs) - def step(self, action: int) -> GymStepReturn: + def step(self, action: int) -> AtariStepReturn: if self.np_random.random() >= self.action_repeat_probability: self._sticky_action = action return self.env.step(self._sticky_action) -class NoopResetEnv(gym.Wrapper): +class NoopResetEnv(gym.Wrapper[np.ndarray, int, np.ndarray, int]): """ Sample initial states by taking random number of no-ops on reset. No-op is assumed to be action 0. @@ -52,24 +54,25 @@ def __init__(self, env: gym.Env, noop_max: int = 30) -> None: self.noop_max = noop_max self.override_num_noops = None self.noop_action = 0 - assert env.unwrapped.get_action_meanings()[0] == "NOOP" + assert env.unwrapped.get_action_meanings()[0] == "NOOP" # type: ignore[attr-defined] - def reset(self, **kwargs) -> np.ndarray: + def reset(self, **kwargs) -> AtariResetReturn: self.env.reset(**kwargs) if self.override_num_noops is not None: noops = self.override_num_noops else: - noops = self.unwrapped.np_random.randint(1, self.noop_max + 1) + noops = self.unwrapped.np_random.integers(1, self.noop_max + 1) assert noops > 0 obs = np.zeros(0) + info: Dict = {} for _ in range(noops): - obs, _, done, _ = self.env.step(self.noop_action) - if done: - obs = self.env.reset(**kwargs) - return obs + obs, _, terminated, truncated, info = self.env.step(self.noop_action) + if terminated or truncated: + obs, info = self.env.reset(**kwargs) + return obs, info -class FireResetEnv(gym.Wrapper): +class FireResetEnv(gym.Wrapper[np.ndarray, int, np.ndarray, int]): """ Take action on reset for environments that are fixed until firing. @@ -78,21 +81,21 @@ class FireResetEnv(gym.Wrapper): def __init__(self, env: gym.Env) -> None: super().__init__(env) - assert env.unwrapped.get_action_meanings()[1] == "FIRE" - assert len(env.unwrapped.get_action_meanings()) >= 3 + assert env.unwrapped.get_action_meanings()[1] == "FIRE" # type: ignore[attr-defined] + assert len(env.unwrapped.get_action_meanings()) >= 3 # type: ignore[attr-defined] - def reset(self, **kwargs) -> np.ndarray: + def reset(self, **kwargs) -> AtariResetReturn: self.env.reset(**kwargs) - obs, _, done, _ = self.env.step(1) - if done: + obs, _, terminated, truncated, _ = self.env.step(1) + if terminated or truncated: self.env.reset(**kwargs) - obs, _, done, _ = self.env.step(2) - if done: + obs, _, terminated, truncated, _ = self.env.step(2) + if terminated or truncated: self.env.reset(**kwargs) - return obs + return obs, {} -class EpisodicLifeEnv(gym.Wrapper): +class EpisodicLifeEnv(gym.Wrapper[np.ndarray, int, np.ndarray, int]): """ Make end-of-life == end-of-episode, but only reset on true game over. Done by DeepMind for the DQN and co. since it helps value estimation. @@ -105,21 +108,21 @@ def __init__(self, env: gym.Env) -> None: self.lives = 0 self.was_real_done = True - def step(self, action: int) -> GymStepReturn: - obs, reward, done, info = self.env.step(action) - self.was_real_done = done + def step(self, action: int) -> AtariStepReturn: + obs, reward, terminated, truncated, info = self.env.step(action) + self.was_real_done = terminated or truncated # check current lives, make loss of life terminal, # then update lives to handle bonus lives - lives = self.env.unwrapped.ale.lives() + lives = self.env.unwrapped.ale.lives() # type: ignore[attr-defined] if 0 < lives < self.lives: # for Qbert sometimes we stay in lives == 0 condition for a few frames # so its important to keep lives > 0, so that we only reset once # the environment advertises done. - done = True + terminated = True self.lives = lives - return obs, reward, done, info + return obs, reward, terminated, truncated, info - def reset(self, **kwargs) -> np.ndarray: + def reset(self, **kwargs) -> AtariResetReturn: """ Calls the Gym environment reset, only when lives are exhausted. This way all states are still reachable even though lives are episodic, @@ -129,21 +132,21 @@ def reset(self, **kwargs) -> np.ndarray: :return: the first observation of the environment """ if self.was_real_done: - obs = self.env.reset(**kwargs) + obs, info = self.env.reset(**kwargs) else: # no-op step to advance from terminal/lost life state - obs, _, done, _ = self.env.step(0) + obs, _, terminated, truncated, info = self.env.step(0) # The no-op step can lead to a game over, so we need to check it again # to see if we should reset the environment and avoid the # monitor.py `RuntimeError: Tried to step environment that needs reset` - if done: - obs = self.env.reset(**kwargs) - self.lives = self.env.unwrapped.ale.lives() - return obs + if terminated or truncated: + obs, info = self.env.reset(**kwargs) + self.lives = self.env.unwrapped.ale.lives() # type: ignore[attr-defined] + return obs, info -class MaxAndSkipEnv(gym.Wrapper): +class MaxAndSkipEnv(gym.Wrapper[np.ndarray, int, np.ndarray, int]): """ Return only every ``skip``-th frame (frameskipping) and return the max between the two last frames. @@ -156,33 +159,36 @@ class MaxAndSkipEnv(gym.Wrapper): def __init__(self, env: gym.Env, skip: int = 4) -> None: super().__init__(env) # most recent raw observations (for max pooling across time steps) + assert env.observation_space.dtype is not None, "No dtype specified for the observation space" + assert env.observation_space.shape is not None, "No shape defined for the observation space" self._obs_buffer = np.zeros((2, *env.observation_space.shape), dtype=env.observation_space.dtype) self._skip = skip - def step(self, action: int) -> GymStepReturn: + def step(self, action: int) -> AtariStepReturn: """ Step the environment with the given action Repeat action, sum reward, and max over last observations. :param action: the action - :return: observation, reward, done, information + :return: observation, reward, terminated, truncated, information """ total_reward = 0.0 - done = False + terminated = truncated = False for i in range(self._skip): - obs, reward, done, info = self.env.step(action) + obs, reward, terminated, truncated, info = self.env.step(action) + done = terminated or truncated if i == self._skip - 2: self._obs_buffer[0] = obs if i == self._skip - 1: self._obs_buffer[1] = obs - total_reward += reward + total_reward += float(reward) if done: break # Note that the observation on the done=True frame # doesn't matter max_frame = self._obs_buffer.max(axis=0) - return max_frame, total_reward, done, info + return max_frame, total_reward, terminated, truncated, info class ClipRewardEnv(gym.RewardWrapper): @@ -195,17 +201,17 @@ class ClipRewardEnv(gym.RewardWrapper): def __init__(self, env: gym.Env) -> None: super().__init__(env) - def reward(self, reward: float) -> float: + def reward(self, reward: SupportsFloat) -> float: """ Bin reward to {+1, 0, -1} by its sign. :param reward: :return: """ - return np.sign(reward) + return np.sign(float(reward)) -class WarpFrame(gym.ObservationWrapper): +class WarpFrame(gym.ObservationWrapper[np.ndarray, int, np.ndarray]): """ Convert to grayscale and warp frames to 84x84 (default) as done in the Nature paper and later work. @@ -219,8 +225,13 @@ def __init__(self, env: gym.Env, width: int = 84, height: int = 84) -> None: super().__init__(env) self.width = width self.height = height + assert isinstance(env.observation_space, spaces.Box), f"Expected Box space, got {env.observation_space}" + self.observation_space = spaces.Box( - low=0, high=255, shape=(self.height, self.width, 1), dtype=env.observation_space.dtype + low=0, + high=255, + shape=(self.height, self.width, 1), + dtype=env.observation_space.dtype, # type: ignore[arg-type] ) def observation(self, frame: np.ndarray) -> np.ndarray: @@ -235,7 +246,7 @@ def observation(self, frame: np.ndarray) -> np.ndarray: return frame[:, :, None] -class AtariWrapper(gym.Wrapper): +class AtariWrapper(gym.Wrapper[np.ndarray, int, np.ndarray, int]): """ Atari 2600 preprocessings @@ -285,7 +296,7 @@ def __init__( env = MaxAndSkipEnv(env, skip=frame_skip) if terminal_on_life_loss: env = EpisodicLifeEnv(env) - if "FIRE" in env.unwrapped.get_action_meanings(): + if "FIRE" in env.unwrapped.get_action_meanings(): # type: ignore[attr-defined] env = FireResetEnv(env) env = WarpFrame(env, width=screen_size, height=screen_size) if clip_reward: diff --git a/stable_baselines3/common/base_class.py b/stable_baselines3/common/base_class.py index bec075699..ece511a24 100644 --- a/stable_baselines3/common/base_class.py +++ b/stable_baselines3/common/base_class.py @@ -8,10 +8,10 @@ from collections import deque from typing import Any, Dict, Iterable, List, Optional, Tuple, Type, TypeVar, Union -import gym +import gymnasium as gym import numpy as np import torch as th -from gym import spaces +from gymnasium import spaces from stable_baselines3.common import utils from stable_baselines3.common.callbacks import BaseCallback, CallbackList, ConvertCallback, ProgressBarCallback @@ -22,7 +22,7 @@ from stable_baselines3.common.policies import BasePolicy from stable_baselines3.common.preprocessing import check_for_nested_spaces, is_image_space, is_image_space_channels_first from stable_baselines3.common.save_util import load_from_zip_file, recursive_getattr, recursive_setattr, save_to_zip_file -from stable_baselines3.common.type_aliases import GymEnv, MaybeCallback, Schedule +from stable_baselines3.common.type_aliases import GymEnv, MaybeCallback, Schedule, TensorDict from stable_baselines3.common.utils import ( check_for_correct_spaces, get_device, @@ -39,11 +39,12 @@ is_vecenv_wrapped, unwrap_vec_normalize, ) +from stable_baselines3.common.vec_env.patch_gym import _convert_space, _patch_env SelfBaseAlgorithm = TypeVar("SelfBaseAlgorithm", bound="BaseAlgorithm") -def maybe_make_env(env: Union[GymEnv, str, None], verbose: int) -> Optional[GymEnv]: +def maybe_make_env(env: Union[GymEnv, str], verbose: int) -> GymEnv: """If env is a string, make the environment; otherwise, return env. :param env: The environment to learn from. @@ -51,9 +52,14 @@ def maybe_make_env(env: Union[GymEnv, str, None], verbose: int) -> Optional[GymE :return A Gym (vector) environment. """ if isinstance(env, str): + env_id = env if verbose >= 1: - print(f"Creating environment from the given name '{env}'") - env = gym.make(env) + print(f"Creating environment from the given name '{env_id}'") + # Set render_mode to `rgb_array` as default, so we can record video + try: + env = gym.make(env_id, render_mode="rgb_array") + except TypeError: + env = gym.make(env_id) return env @@ -90,6 +96,11 @@ class BaseAlgorithm(ABC): # Policy aliases (see _get_policy_from_name()) policy_aliases: Dict[str, Type[BasePolicy]] = {} policy: BasePolicy + observation_space: spaces.Space + action_space: spaces.Space + n_envs: int + lr_schedule: Schedule + _logger: Logger def __init__( self, @@ -106,8 +117,8 @@ def __init__( seed: Optional[int] = None, use_sde: bool = False, sde_sample_freq: int = -1, - supported_action_spaces: Optional[Tuple[spaces.Space, ...]] = None, - ): + supported_action_spaces: Optional[Tuple[Type[spaces.Space], ...]] = None, + ) -> None: if isinstance(policy, str): self.policy_class = self._get_policy_from_name(policy) else: @@ -117,14 +128,9 @@ def __init__( if verbose >= 1: print(f"Using {self.device} device") - self.env = None # type: Optional[GymEnv] - # get VecNormalize object if needed - self._vec_normalize_env = unwrap_vec_normalize(env) self.verbose = verbose self.policy_kwargs = {} if policy_kwargs is None else policy_kwargs - self.observation_space: spaces.Space - self.action_space: spaces.Space - self.n_envs: int + self.num_timesteps = 0 # Used for updating schedules self._total_timesteps = 0 @@ -132,10 +138,9 @@ def __init__( self._num_timesteps_at_start = 0 self.seed = seed self.action_noise: Optional[ActionNoise] = None - self.start_time = None + self.start_time = 0.0 self.learning_rate = learning_rate self.tensorboard_log = tensorboard_log - self.lr_schedule = None # type: Optional[Schedule] self._last_obs = None # type: Optional[Union[np.ndarray, Dict[str, np.ndarray]]] self._last_episode_starts = None # type: Optional[np.ndarray] # When using VecNormalize: @@ -146,17 +151,17 @@ def __init__( self.sde_sample_freq = sde_sample_freq # Track the training progress remaining (from 1 to 0) # this is used to update the learning rate - self._current_progress_remaining = 1 + self._current_progress_remaining = 1.0 # Buffers for logging self._stats_window_size = stats_window_size self.ep_info_buffer = None # type: Optional[deque] self.ep_success_buffer = None # type: Optional[deque] # For logging (and TD3 delayed updates) self._n_updates = 0 # type: int - # The logger object - self._logger = None # type: Logger # Whether the user passed a custom logger or not self._custom_logger = False + self.env: Optional[VecEnv] = None + self._vec_normalize_env: Optional[VecNormalize] = None # Create and wrap the env if needed if env is not None: @@ -168,6 +173,9 @@ def __init__( self.n_envs = env.num_envs self.env = env + # get VecNormalize object if needed + self._vec_normalize_env = unwrap_vec_normalize(env) + if supported_action_spaces is not None: assert isinstance(self.action_space, supported_action_spaces), ( f"The algorithm only supports {supported_action_spaces} as action spaces " @@ -204,13 +212,15 @@ def _wrap_env(env: GymEnv, verbose: int = 0, monitor_wrapper: bool = True) -> Ve :return: The wrapped environment. """ if not isinstance(env, VecEnv): + # Patch to support gym 0.21/0.26 and gymnasium + env = _patch_env(env) if not is_wrapped(env, Monitor) and monitor_wrapper: if verbose >= 1: print("Wrapping the env with a `Monitor` wrapper") env = Monitor(env) if verbose >= 1: print("Wrapping the env in a DummyVecEnv.") - env = DummyVecEnv([lambda: env]) + env = DummyVecEnv([lambda: env]) # type: ignore[list-item, return-value] # Make sure that dict-spaces are not nested (not supported) check_for_nested_spaces(env.observation_space) @@ -223,11 +233,11 @@ def _wrap_env(env: GymEnv, verbose: int = 0, monitor_wrapper: bool = True) -> Ve # the other channel last), VecTransposeImage will throw an error for space in env.observation_space.spaces.values(): wrap_with_vectranspose = wrap_with_vectranspose or ( - is_image_space(space) and not is_image_space_channels_first(space) + is_image_space(space) and not is_image_space_channels_first(space) # type: ignore[arg-type] ) else: wrap_with_vectranspose = is_image_space(env.observation_space) and not is_image_space_channels_first( - env.observation_space + env.observation_space # type: ignore[arg-type] ) if wrap_with_vectranspose: @@ -409,7 +419,10 @@ def _setup_learn( # Avoid resetting the environment when calling ``.learn()`` consecutive times if reset_num_timesteps or self._last_obs is None: - self._last_obs = self.env.reset() # pytype: disable=annotation-type-mismatch + assert self.env is not None + # pytype: disable=annotation-type-mismatch + self._last_obs = self.env.reset() # type: ignore[assignment] + # pytype: enable=annotation-type-mismatch self._last_episode_starts = np.ones((self.env.num_envs,), dtype=bool) # Retrieve unnormalized observation for saving into the buffer if self._vec_normalize_env is not None: @@ -432,6 +445,9 @@ def _update_info_buffer(self, infos: List[Dict[str, Any]], dones: Optional[np.nd :param infos: List of additional information about the transition. :param dones: Termination signals """ + assert self.ep_info_buffer is not None + assert self.ep_success_buffer is not None + if dones is None: dones = np.array([False] * len(infos)) for idx, info in enumerate(infos): @@ -555,7 +571,7 @@ def set_random_seed(self, seed: Optional[int] = None) -> None: def set_parameters( self, - load_path_or_dict: Union[str, Dict[str, Dict]], + load_path_or_dict: Union[str, TensorDict], exact_match: bool = True, device: Union[th.device, str] = "auto", ) -> None: @@ -571,7 +587,7 @@ def set_parameters( can be used to update only specific parameters. :param device: Device on which the code should run. """ - params = None + params = {} if isinstance(load_path_or_dict, dict): params = load_path_or_dict else: @@ -609,7 +625,7 @@ def set_parameters( # # Solution: Just load the state-dict as is, and trust # the user has provided a sensible state dictionary. - attr.load_state_dict(params[name]) + attr.load_state_dict(params[name]) # type: ignore[arg-type] else: # Assume attr is th.nn.Module attr.load_state_dict(params[name], strict=exact_match) @@ -667,6 +683,9 @@ def load( # noqa: C901 print_system_info=print_system_info, ) + assert data is not None, "No data found in the saved file" + assert params is not None, "No params found in the saved file" + # Remove stored device information and replace with ours if "policy_kwargs" in data: if "device" in data["policy_kwargs"]: @@ -686,6 +705,10 @@ def load( # noqa: C901 if "observation_space" not in data or "action_space" not in data: raise KeyError("The observation_space and action_space were not given, can't verify new environments") + # Gym -> Gymnasium space conversion + for key in {"observation_space", "action_space"}: + data[key] = _convert_space(data[key]) # pytype: disable=unsupported-operands + if env is not None: # Wrap first if needed env = cls._wrap_env(env, data["verbose"]) @@ -703,13 +726,14 @@ def load( # noqa: C901 if "env" in data: env = data["env"] - # noinspection PyArgumentList - model = cls( # pytype: disable=not-instantiable,wrong-keyword-args + # pytype: disable=not-instantiable,wrong-keyword-args + model = cls( policy=data["policy_class"], env=env, device=device, - _init_setup_model=False, # pytype: disable=not-instantiable,wrong-keyword-args + _init_setup_model=False, # type: ignore[call-arg] ) + # pytype: enable=not-instantiable,wrong-keyword-args # load parameters model.__dict__.update(data) @@ -747,12 +771,12 @@ def load( # noqa: C901 continue # Set the data attribute directly to avoid issue when using optimizers # See https://github.com/DLR-RM/stable-baselines3/issues/391 - recursive_setattr(model, name + ".data", pytorch_variables[name].data) + recursive_setattr(model, f"{name}.data", pytorch_variables[name].data) # Sample gSDE exploration matrix, so it uses the right device # see issue #44 if model.use_sde: - model.policy.reset_noise() # pytype: disable=attribute-error + model.policy.reset_noise() # type: ignore[operator] # pytype: disable=attribute-error return model def get_parameters(self) -> Dict[str, Dict]: diff --git a/stable_baselines3/common/buffers.py b/stable_baselines3/common/buffers.py index 95a9c1e86..e52f08f69 100644 --- a/stable_baselines3/common/buffers.py +++ b/stable_baselines3/common/buffers.py @@ -4,7 +4,7 @@ import numpy as np import torch as th -from gym import spaces +from gymnasium import spaces from stable_baselines3.common.preprocessing import get_action_dim, get_obs_shape from stable_baselines3.common.type_aliases import ( @@ -335,6 +335,15 @@ class RolloutBuffer(BaseBuffer): :param n_envs: Number of parallel environments """ + observations: np.ndarray + actions: np.ndarray + rewards: np.ndarray + advantages: np.ndarray + returns: np.ndarray + episode_starts: np.ndarray + log_probs: np.ndarray + values: np.ndarray + def __init__( self, buffer_size: int, @@ -348,8 +357,6 @@ def __init__( super().__init__(buffer_size, observation_space, action_space, device, n_envs=n_envs) self.gae_lambda = gae_lambda self.gamma = gamma - self.observations, self.actions, self.rewards, self.advantages = None, None, None, None - self.returns, self.episode_starts, self.values, self.log_probs = None, None, None, None self.generator_ready = False self.reset() diff --git a/stable_baselines3/common/callbacks.py b/stable_baselines3/common/callbacks.py index 28d07e006..c9b8a3367 100644 --- a/stable_baselines3/common/callbacks.py +++ b/stable_baselines3/common/callbacks.py @@ -3,7 +3,7 @@ from abc import ABC, abstractmethod from typing import Any, Callable, Dict, List, Optional, Union -import gym +import gymnasium as gym import numpy as np from stable_baselines3.common.logger import Logger @@ -313,7 +313,7 @@ class ConvertCallback(BaseCallback): :param verbose: Verbosity level: 0 for no output, 1 for info messages, 2 for debug messages """ - def __init__(self, callback: Callable[[Dict[str, Any], Dict[str, Any]], bool], verbose: int = 0): + def __init__(self, callback: Optional[Callable[[Dict[str, Any], Dict[str, Any]], bool]], verbose: int = 0): super().__init__(verbose) self.callback = callback diff --git a/stable_baselines3/common/distributions.py b/stable_baselines3/common/distributions.py index 2b942e8f8..8a8e9f903 100644 --- a/stable_baselines3/common/distributions.py +++ b/stable_baselines3/common/distributions.py @@ -5,7 +5,7 @@ import numpy as np import torch as th -from gym import spaces +from gymnasium import spaces from torch import nn from torch.distributions import Bernoulli, Categorical, Normal diff --git a/stable_baselines3/common/env_checker.py b/stable_baselines3/common/env_checker.py index b71454b1c..cc8be48ef 100644 --- a/stable_baselines3/common/env_checker.py +++ b/stable_baselines3/common/env_checker.py @@ -1,9 +1,9 @@ import warnings from typing import Any, Dict, Union -import gym +import gymnasium as gym import numpy as np -from gym import spaces +from gymnasium import spaces from stable_baselines3.common.preprocessing import check_for_nested_spaces, is_image_space_channels_first from stable_baselines3.common.vec_env import DummyVecEnv, VecCheckNan @@ -60,9 +60,15 @@ def _check_unsupported_spaces(env: gym.Env, observation_space: spaces.Space, act if isinstance(observation_space, spaces.Dict): nested_dict = False - for space in observation_space.spaces.values(): + for key, space in observation_space.spaces.items(): if isinstance(space, spaces.Dict): nested_dict = True + if isinstance(space, spaces.Discrete) and space.start != 0: + warnings.warn( + f"Discrete observation space (key '{key}') with a non-zero start is not supported by Stable-Baselines3. " + "You can use a wrapper or update your observation space." + ) + if nested_dict: warnings.warn( "Nested observation spaces are not supported by Stable Baselines3 " @@ -81,6 +87,18 @@ def _check_unsupported_spaces(env: gym.Env, observation_space: spaces.Space, act "which is supported by SB3." ) + if isinstance(observation_space, spaces.Discrete) and observation_space.start != 0: + warnings.warn( + "Discrete observation space with a non-zero start is not supported by Stable-Baselines3. " + "You can use a wrapper or update your observation space." + ) + + if isinstance(action_space, spaces.Discrete) and action_space.start != 0: + warnings.warn( + "Discrete action space with a non-zero start is not supported by Stable-Baselines3. " + "You can use a wrapper or update your action space." + ) + if not _is_numpy_array_space(action_space): warnings.warn( "The action space is not based off a numpy array. Typically this means it's either a Dict or Tuple space. " @@ -101,9 +119,8 @@ def _is_goal_env(env: gym.Env) -> bool: """ Check if the env uses the convention for goal-conditioned envs (previously, the gym.GoalEnv interface) """ - if isinstance(env, gym.Wrapper): # We need to unwrap the env since gym.Wrapper has the compute_reward method - return _is_goal_env(env.unwrapped) - return hasattr(env, "compute_reward") + # We need to unwrap the env since gym.Wrapper has the compute_reward method + return hasattr(env.unwrapped, "compute_reward") def _check_goal_env_obs(obs: dict, observation_space: spaces.Dict, method_name: str) -> None: @@ -131,7 +148,7 @@ def _check_goal_env_compute_reward( env: gym.Env, reward: float, info: Dict[str, Any], -): +) -> None: """ Check that reward is computed with `compute_reward` and that the implementation is vectorized. @@ -165,7 +182,9 @@ def _check_obs(obs: Union[tuple, dict, np.ndarray, int], observation_space: spac # The check for a GoalEnv is done by the base class if isinstance(observation_space, spaces.Discrete): - assert isinstance(obs, int), f"The observation returned by `{method_name}()` method must be an int" + # Since https://github.com/Farama-Foundation/Gymnasium/pull/141, + # `sample()` will return a np.int64 instead of an int + assert np.issubdtype(type(obs), np.integer), f"The observation returned by `{method_name}()` method must be an int" elif _is_numpy_array_space(observation_space): assert isinstance(obs, np.ndarray), f"The observation returned by `{method_name}()` method must be a numpy array" @@ -174,27 +193,32 @@ def _check_obs(obs: Union[tuple, dict, np.ndarray, int], observation_space: spac # check obs dimensions, dtype and bounds assert observation_space.shape == obs.shape, ( f"The observation returned by the `{method_name}()` method does not match the shape " - f"of the given observation space. Expected: {observation_space.shape}, actual shape: {obs.shape}" + f"of the given observation space {observation_space}. " + f"Expected: {observation_space.shape}, actual shape: {obs.shape}" ) - assert observation_space.dtype == obs.dtype, ( - f"The observation returned by the `{method_name}()` method does not match the data type " - f"of the given observation space. Expected: {observation_space.dtype}, actual dtype: {obs.dtype}" + assert np.can_cast(obs.dtype, observation_space.dtype), ( + f"The observation returned by the `{method_name}()` method does not match the data type (cannot cast) " + f"of the given observation space {observation_space}. " + f"Expected: {observation_space.dtype}, actual dtype: {obs.dtype}" ) if isinstance(observation_space, spaces.Box): assert np.all(obs >= observation_space.low), ( f"The observation returned by the `{method_name}()` method does not match the lower bound " - f"of the given observation space. Expected: obs >= {np.min(observation_space.low)}, " + f"of the given observation space {observation_space}." + f"Expected: obs >= {np.min(observation_space.low)}, " f"actual min value: {np.min(obs)} at index {np.argmin(obs)}" ) assert np.all(obs <= observation_space.high), ( f"The observation returned by the `{method_name}()` method does not match the upper bound " - f"of the given observation space. Expected: obs <= {np.max(observation_space.high)}, " + f"of the given observation space {observation_space}. " + f"Expected: obs <= {np.max(observation_space.high)}, " f"actual max value: {np.max(obs)} at index {np.argmax(obs)}" ) - assert observation_space.contains( - obs - ), f"The observation returned by the `{method_name}()` method does not match the given observation space" + assert observation_space.contains(obs), ( + f"The observation returned by the `{method_name}()` method " + f"does not match the given observation space {observation_space}" + ) def _check_box_obs(observation_space: spaces.Box, key: str = "") -> None: @@ -221,8 +245,12 @@ def _check_returned_values(env: gym.Env, observation_space: spaces.Space, action """ Check the returned values by the env when calling `.reset()` or `.step()` methods. """ - # because env inherits from gym.Env, we assume that `reset()` and `step()` methods exists - obs = env.reset() + # because env inherits from gymnasium.Env, we assume that `reset()` and `step()` methods exists + reset_returns = env.reset() + assert isinstance(reset_returns, tuple), "`reset()` must return a tuple (obs, info)" + assert len(reset_returns) == 2, f"`reset()` must return a tuple of size 2 (obs, info), not {len(reset_returns)}" + obs, info = reset_returns + assert isinstance(info, dict), f"The second element of the tuple return by `reset()` must be a dictionary not {info}" if _is_goal_env(env): # Make mypy happy, already checked @@ -249,19 +277,24 @@ def _check_returned_values(env: gym.Env, observation_space: spaces.Space, action action = action_space.sample() data = env.step(action) - assert len(data) == 4, "The `step()` method must return four values: obs, reward, done, info" + assert len(data) == 5, ( + "The `step()` method must return five values: " + f"obs, reward, terminated, truncated, info. Actual: {len(data)} values returned." + ) # Unpack - obs, reward, done, info = data + obs, reward, terminated, truncated, info = data - if _is_goal_env(env): - # Make mypy happy, already checked - assert isinstance(observation_space, spaces.Dict) - _check_goal_env_obs(obs, observation_space, "step") - _check_goal_env_compute_reward(obs, env, reward, info) - elif isinstance(observation_space, spaces.Dict): + if isinstance(observation_space, spaces.Dict): assert isinstance(obs, dict), "The observation returned by `step()` must be a dictionary" + # Additional checks for GoalEnvs + if _is_goal_env(env): + # Make mypy happy, already checked + assert isinstance(observation_space, spaces.Dict) + _check_goal_env_obs(obs, observation_space, "step") + _check_goal_env_compute_reward(obs, env, float(reward), info) + if not obs.keys() == observation_space.spaces.keys(): raise AssertionError( "The observation keys returned by `step()` must match the observation " @@ -279,11 +312,14 @@ def _check_returned_values(env: gym.Env, observation_space: spaces.Space, action # We also allow int because the reward will be cast to float assert isinstance(reward, (float, int)), "The reward returned by `step()` must be a float" - assert isinstance(done, bool), "The `done` signal must be a boolean" + assert isinstance(terminated, bool), "The `terminated` signal must be a boolean" + assert isinstance(truncated, bool), "The `truncated` signal must be a boolean" assert isinstance(info, dict), "The `info` returned by `step()` must be a python dictionary" # Goal conditioned env if _is_goal_env(env): + # for mypy, env.unwrapped was checked by _is_goal_env() + assert hasattr(env, "compute_reward") assert reward == env.compute_reward(obs["achieved_goal"], obs["desired_goal"], info) @@ -299,8 +335,10 @@ def _check_spaces(env: gym.Env) -> None: assert hasattr(env, "observation_space"), "You must specify an observation space (cf gym.spaces)" + gym_spaces assert hasattr(env, "action_space"), "You must specify an action space (cf gym.spaces)" + gym_spaces - assert isinstance(env.observation_space, spaces.Space), "The observation space must inherit from gym.spaces" + gym_spaces - assert isinstance(env.action_space, spaces.Space), "The action space must inherit from gym.spaces" + gym_spaces + assert isinstance(env.observation_space, spaces.Space), ( + "The observation space must inherit from gymnasium.spaces" + gym_spaces + ) + assert isinstance(env.action_space, spaces.Space), "The action space must inherit from gymnasium.spaces" + gym_spaces if _is_goal_env(env): assert isinstance( @@ -309,9 +347,9 @@ def _check_spaces(env: gym.Env) -> None: # Check render cannot be covered by CI -def _check_render(env: gym.Env, warn: bool = True, headless: bool = False) -> None: # pragma: no cover +def _check_render(env: gym.Env, warn: bool = False) -> None: # pragma: no cover """ - Check the declared render modes and the `render()`/`close()` + Check the instantiated render mode (if any) by calling the `render()`/`close()` method of the environment. :param env: The environment to check @@ -319,24 +357,19 @@ def _check_render(env: gym.Env, warn: bool = True, headless: bool = False) -> No :param headless: Whether to disable render modes that require a graphical interface. False by default. """ - render_modes = env.metadata.get("render.modes") + render_modes = env.metadata.get("render_modes") if render_modes is None: if warn: warnings.warn( "No render modes was declared in the environment " - " (env.metadata['render.modes'] is None or not defined), " + "(env.metadata['render_modes'] is None or not defined), " "you may have trouble when calling `.render()`" ) - else: - # Don't check render mode that require a - # graphical interface (useful for CI) - if headless and "human" in render_modes: - render_modes.remove("human") - # Check all declared render modes - for render_mode in render_modes: - env.render(mode=render_mode) - env.close() + # Only check currrent render mode + if env.render_mode: + env.render() + env.close() def check_env(env: gym.Env, warn: bool = True, skip_render_check: bool = True) -> None: @@ -401,7 +434,7 @@ def check_env(env: gym.Env, warn: bool = True, skip_render_check: bool = True) - # ==== Check the render method and the declared render modes ==== if not skip_render_check: - _check_render(env, warn=warn) # pragma: no cover + _check_render(env, warn) # pragma: no cover try: check_for_nested_spaces(env.observation_space) diff --git a/stable_baselines3/common/env_util.py b/stable_baselines3/common/env_util.py index c85d1472b..c3b73909e 100644 --- a/stable_baselines3/common/env_util.py +++ b/stable_baselines3/common/env_util.py @@ -1,11 +1,13 @@ import os from typing import Any, Callable, Dict, Optional, Type, Union -import gym +import gymnasium as gym from stable_baselines3.common.atari_wrappers import AtariWrapper from stable_baselines3.common.monitor import Monitor +from stable_baselines3.common.utils import compat_gym_seed from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv, VecEnv +from stable_baselines3.common.vec_env.patch_gym import _patch_env def unwrap_wrapper(env: gym.Env, wrapper_class: Type[gym.Wrapper]) -> Optional[gym.Wrapper]: @@ -24,7 +26,7 @@ def unwrap_wrapper(env: gym.Env, wrapper_class: Type[gym.Wrapper]) -> Optional[g return None -def is_wrapped(env: Type[gym.Env], wrapper_class: Type[gym.Wrapper]) -> bool: +def is_wrapped(env: gym.Env, wrapper_class: Type[gym.Wrapper]) -> bool: """ Check if a given environment has been wrapped with a given wrapper. @@ -72,25 +74,40 @@ def make_vec_env( :param wrapper_kwargs: Keyword arguments to pass to the ``Wrapper`` class constructor. :return: The wrapped environment """ - env_kwargs = {} if env_kwargs is None else env_kwargs - vec_env_kwargs = {} if vec_env_kwargs is None else vec_env_kwargs - monitor_kwargs = {} if monitor_kwargs is None else monitor_kwargs - wrapper_kwargs = {} if wrapper_kwargs is None else wrapper_kwargs + env_kwargs = env_kwargs or {} + vec_env_kwargs = vec_env_kwargs or {} + monitor_kwargs = monitor_kwargs or {} + wrapper_kwargs = wrapper_kwargs or {} + assert vec_env_kwargs is not None # for mypy + + def make_env(rank: int) -> Callable[[], gym.Env]: + def _init() -> gym.Env: + # For type checker: + assert monitor_kwargs is not None + assert wrapper_kwargs is not None + assert env_kwargs is not None - def make_env(rank): - def _init(): if isinstance(env_id, str): - env = gym.make(env_id, **env_kwargs) + # if the render mode was not specified, we set it to `rgb_array` as default. + kwargs = {"render_mode": "rgb_array"} + kwargs.update(env_kwargs) + try: + env = gym.make(env_id, **kwargs) # type: ignore[arg-type] + except TypeError: + env = gym.make(env_id, **env_kwargs) else: env = env_id(**env_kwargs) + # Patch to support gym 0.21/0.26 and gymnasium + env = _patch_env(env) + if seed is not None: - env.seed(seed + rank) + compat_gym_seed(env, seed=seed + rank) env.action_space.seed(seed + rank) # Wrap the env in a Monitor wrapper # to have additional training information monitor_path = os.path.join(monitor_dir, str(rank)) if monitor_dir is not None else None # Create the monitor folder if needed - if monitor_path is not None: + if monitor_path is not None and monitor_dir is not None: os.makedirs(monitor_dir, exist_ok=True) env = Monitor(env, filename=monitor_path, **monitor_kwargs) # Optionally, wrap the environment with the provided wrapper diff --git a/stable_baselines3/common/envs/bit_flipping_env.py b/stable_baselines3/common/envs/bit_flipping_env.py index d6724c9cc..ec0de2bf9 100644 --- a/stable_baselines3/common/envs/bit_flipping_env.py +++ b/stable_baselines3/common/envs/bit_flipping_env.py @@ -1,9 +1,9 @@ from collections import OrderedDict -from typing import Any, Dict, Optional, Union +from typing import Any, Dict, Optional, Tuple, Union import numpy as np -from gym import Env, spaces -from gym.envs.registration import EnvSpec +from gymnasium import Env, spaces +from gymnasium.envs.registration import EnvSpec from stable_baselines3.common.type_aliases import GymStepReturn @@ -25,7 +25,7 @@ class BitFlippingEnv(Env): :param channel_first: Whether to use channel-first or last image. """ - spec = EnvSpec("BitFlippingEnv-v0") + spec = EnvSpec("BitFlippingEnv-v0", "no-entry-point") def __init__( self, @@ -96,7 +96,7 @@ def __init__( self.discrete_obs_space = discrete_obs_space self.image_obs_space = image_obs_space self.state = None - self.desired_goal = np.ones((n_bits,)) + self.desired_goal = np.ones((n_bits,), dtype=self.observation_space["desired_goal"].dtype) if max_steps is None: max_steps = n_bits self.max_steps = max_steps @@ -157,24 +157,34 @@ def _get_obs(self) -> Dict[str, Union[int, np.ndarray]]: ] ) - def reset(self) -> Dict[str, Union[int, np.ndarray]]: + def reset( + self, *, seed: Optional[int] = None, options: Optional[Dict] = None + ) -> Tuple[Dict[str, Union[int, np.ndarray]], Dict]: + if seed is not None: + self.obs_space.seed(seed) self.current_step = 0 self.state = self.obs_space.sample() - return self._get_obs() + return self._get_obs(), {} def step(self, action: Union[np.ndarray, int]) -> GymStepReturn: + """ + Step into the env. + + :param action: + :return: + """ if self.continuous: self.state[action > 0] = 1 - self.state[action > 0] else: self.state[action] = 1 - self.state[action] obs = self._get_obs() reward = float(self.compute_reward(obs["achieved_goal"], obs["desired_goal"], None)) - done = reward == 0 + terminated = reward == 0 self.current_step += 1 # Episode terminate when we reached the goal or the max number of steps - info = {"is_success": done} - done = done or self.current_step >= self.max_steps - return obs, reward, done, info + info = {"is_success": terminated} + truncated = self.current_step >= self.max_steps + return obs, reward, terminated, truncated, info def compute_reward( self, achieved_goal: Union[int, np.ndarray], desired_goal: Union[int, np.ndarray], _info: Optional[Dict[str, Any]] diff --git a/stable_baselines3/common/envs/identity_env.py b/stable_baselines3/common/envs/identity_env.py index a8bed175a..99a664999 100644 --- a/stable_baselines3/common/envs/identity_env.py +++ b/stable_baselines3/common/envs/identity_env.py @@ -1,8 +1,8 @@ from typing import Any, Dict, Generic, Optional, Tuple, TypeVar, Union -import gym +import gymnasium as gym import numpy as np -from gym import spaces +from gymnasium import spaces from stable_baselines3.common.type_aliases import GymStepReturn @@ -34,18 +34,21 @@ def __init__(self, dim: Optional[int] = None, space: Optional[spaces.Space] = No self.num_resets = -1 # Becomes 0 after __init__ exits. self.reset() - def reset(self) -> T: + def reset(self, *, seed: Optional[int] = None, options: Optional[Dict] = None) -> Tuple[T, Dict]: + if seed is not None: + super().reset(seed=seed) self.current_step = 0 self.num_resets += 1 self._choose_next_state() - return self.state + return self.state, {} - def step(self, action: T) -> Tuple[T, float, bool, Dict[str, Any]]: + def step(self, action: T) -> Tuple[T, float, bool, bool, Dict[str, Any]]: reward = self._get_reward(action) self._choose_next_state() self.current_step += 1 - done = self.current_step >= self.ep_length - return self.state, reward, done, {} + terminated = False + truncated = self.current_step >= self.ep_length + return self.state, reward, terminated, truncated, {} def _choose_next_state(self) -> None: self.state = self.action_space.sample() @@ -71,12 +74,13 @@ def __init__(self, low: float = -1.0, high: float = 1.0, eps: float = 0.05, ep_l super().__init__(ep_length=ep_length, space=space) self.eps = eps - def step(self, action: np.ndarray) -> Tuple[np.ndarray, float, bool, Dict[str, Any]]: + def step(self, action: np.ndarray) -> Tuple[np.ndarray, float, bool, bool, Dict[str, Any]]: reward = self._get_reward(action) self._choose_next_state() self.current_step += 1 - done = self.current_step >= self.ep_length - return self.state, reward, done, {} + terminated = False + truncated = self.current_step >= self.ep_length + return self.state, reward, terminated, truncated, {} def _get_reward(self, action: np.ndarray) -> float: return 1.0 if (self.state - self.eps) <= action <= (self.state + self.eps) else 0.0 @@ -138,15 +142,18 @@ def __init__( self.ep_length = 10 self.current_step = 0 - def reset(self) -> np.ndarray: + def reset(self, *, seed: Optional[int] = None, options: Optional[Dict] = None) -> Tuple[np.ndarray, Dict]: + if seed is not None: + super().reset(seed=seed) self.current_step = 0 - return self.observation_space.sample() + return self.observation_space.sample(), {} def step(self, action: Union[np.ndarray, int]) -> GymStepReturn: reward = 0.0 self.current_step += 1 - done = self.current_step >= self.ep_length - return self.observation_space.sample(), reward, done, {} + terminated = False + truncated = self.current_step >= self.ep_length + return self.observation_space.sample(), reward, terminated, truncated, {} def render(self, mode: str = "human") -> None: pass diff --git a/stable_baselines3/common/envs/multi_input_envs.py b/stable_baselines3/common/envs/multi_input_envs.py index 166c6991a..3bb07106a 100644 --- a/stable_baselines3/common/envs/multi_input_envs.py +++ b/stable_baselines3/common/envs/multi_input_envs.py @@ -1,8 +1,8 @@ -from typing import Dict, Union +from typing import Dict, Optional, Tuple, Union -import gym +import gymnasium as gym import numpy as np -from gym import spaces +from gymnasium import spaces from stable_baselines3.common.type_aliases import GymStepReturn @@ -153,11 +153,12 @@ def step(self, action: Union[float, np.ndarray]) -> GymStepReturn: got_to_end = self.state == self.max_state reward = 1 if got_to_end else reward - done = self.count > self.max_count or got_to_end + truncated = self.count > self.max_count + terminated = got_to_end self.log = f"Went {self.action2str[action]} in state {prev_state}, got to state {self.state}" - return self.get_state_mapping(), reward, done, {"got_to_end": got_to_end} + return self.get_state_mapping(), reward, terminated, truncated, {"got_to_end": got_to_end} def render(self, mode: str = "human") -> None: """ @@ -167,15 +168,18 @@ def render(self, mode: str = "human") -> None: """ print(self.log) - def reset(self) -> Dict[str, np.ndarray]: + def reset(self, *, seed: Optional[int] = None, options: Optional[Dict] = None) -> Tuple[Dict[str, np.ndarray], Dict]: """ Resets the environment state and step count and returns reset observation. + :param seed: :return: observation dict {'vec': ..., 'img': ...} """ + if seed is not None: + super().reset(seed=seed) self.count = 0 if not self.random_start: self.state = 0 else: self.state = np.random.randint(0, self.max_state) - return self.state_mapping[self.state] + return self.state_mapping[self.state], {} diff --git a/stable_baselines3/common/evaluation.py b/stable_baselines3/common/evaluation.py index 3b634375b..c9253a899 100644 --- a/stable_baselines3/common/evaluation.py +++ b/stable_baselines3/common/evaluation.py @@ -1,7 +1,7 @@ import warnings from typing import Any, Callable, Dict, List, Optional, Tuple, Union -import gym +import gymnasium as gym import numpy as np from stable_baselines3.common import type_aliases @@ -59,7 +59,7 @@ def evaluate_policy( from stable_baselines3.common.monitor import Monitor if not isinstance(env, VecEnv): - env = DummyVecEnv([lambda: env]) + env = DummyVecEnv([lambda: env]) # type: ignore[list-item, return-value] is_monitor_wrapped = is_vecenv_wrapped(env, VecMonitor) or env.env_is_wrapped(Monitor)[0] @@ -85,7 +85,12 @@ def evaluate_policy( states = None episode_starts = np.ones((env.num_envs,), dtype=bool) while (episode_counts < episode_count_targets).any(): - actions, states = model.predict(observations, state=states, episode_start=episode_starts, deterministic=deterministic) + actions, states = model.predict( + observations, # type: ignore[arg-type] + state=states, + episode_start=episode_starts, + deterministic=deterministic, + ) new_observations, rewards, dones, infos = env.step(actions) current_rewards += rewards current_lengths += 1 diff --git a/stable_baselines3/common/monitor.py b/stable_baselines3/common/monitor.py index b8ebc2bac..9c3bd54a8 100644 --- a/stable_baselines3/common/monitor.py +++ b/stable_baselines3/common/monitor.py @@ -5,16 +5,14 @@ import os import time from glob import glob -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import Any, Dict, List, Optional, SupportsFloat, Tuple, Union -import gym -import numpy as np +import gymnasium as gym import pandas +from gymnasium.core import ActType, ObsType -from stable_baselines3.common.type_aliases import GymObs, GymStepReturn - -class Monitor(gym.Wrapper): +class Monitor(gym.Wrapper[ObsType, ActType, ObsType, ActType]): """ A monitor wrapper for Gym environments, it is used to know the episode reward, length, time and other data. @@ -43,9 +41,10 @@ def __init__( self.t_start = time.time() self.results_writer = None if filename is not None: + env_id = env.spec.id if env.spec is not None else None self.results_writer = ResultsWriter( filename, - header={"t_start": self.t_start, "env_id": env.spec and env.spec.id}, + header={"t_start": self.t_start, "env_id": str(env_id)}, extra_keys=reset_keywords + info_keywords, override_existing=override_existing, ) @@ -62,7 +61,7 @@ def __init__( # extra info about the current episode, that was passed in during reset() self.current_reset_info: Dict[str, Any] = {} - def reset(self, **kwargs) -> GymObs: + def reset(self, **kwargs) -> Tuple[ObsType, Dict[str, Any]]: """ Calls the Gym environment reset. Can only be called if the environment is over, or if allow_early_resets is True @@ -83,18 +82,18 @@ def reset(self, **kwargs) -> GymObs: self.current_reset_info[key] = value return self.env.reset(**kwargs) - def step(self, action: Union[np.ndarray, int]) -> GymStepReturn: + def step(self, action: ActType) -> Tuple[ObsType, SupportsFloat, bool, bool, Dict[str, Any]]: """ Step the environment with the given action :param action: the action - :return: observation, reward, done, information + :return: observation, reward, terminated, truncated, information """ if self.needs_reset: raise RuntimeError("Tried to step environment that needs reset") - observation, reward, done, info = self.env.step(action) - self.rewards.append(reward) - if done: + observation, reward, terminated, truncated, info = self.env.step(action) + self.rewards.append(float(reward)) + if terminated or truncated: self.needs_reset = True ep_rew = sum(self.rewards) ep_len = len(self.rewards) @@ -109,7 +108,7 @@ def step(self, action: Union[np.ndarray, int]) -> GymStepReturn: self.results_writer.write_row(ep_info) info["episode"] = ep_info self.total_steps += 1 - return observation, reward, done, info + return observation, reward, terminated, truncated, info def close(self) -> None: """ diff --git a/stable_baselines3/common/off_policy_algorithm.py b/stable_baselines3/common/off_policy_algorithm.py index a4524517d..e3e6c594a 100644 --- a/stable_baselines3/common/off_policy_algorithm.py +++ b/stable_baselines3/common/off_policy_algorithm.py @@ -8,7 +8,7 @@ import numpy as np import torch as th -from gym import spaces +from gymnasium import spaces from stable_baselines3.common.base_class import BaseAlgorithm from stable_baselines3.common.buffers import DictReplayBuffer, ReplayBuffer @@ -75,6 +75,8 @@ class OffPolicyAlgorithm(BaseAlgorithm): :param supported_action_spaces: The action spaces supported by the algorithm. """ + actor: th.nn.Module + def __init__( self, policy: Union[str, Type[BasePolicy]], @@ -103,7 +105,7 @@ def __init__( sde_sample_freq: int = -1, use_sde_at_warmup: bool = False, sde_support: bool = True, - supported_action_spaces: Optional[Tuple[spaces.Space, ...]] = None, + supported_action_spaces: Optional[Tuple[Type[spaces.Space], ...]] = None, ): super().__init__( policy=policy, @@ -129,6 +131,7 @@ def __init__( self.gradient_steps = gradient_steps self.action_noise = action_noise self.optimize_memory_usage = optimize_memory_usage + self.replay_buffer: Optional[ReplayBuffer] = None self.replay_buffer_class = replay_buffer_class self.replay_buffer_kwargs = replay_buffer_kwargs or {} self._episode_storage = None @@ -136,8 +139,6 @@ def __init__( # Save train freq parameter, will be converted later to TrainFreq object self.train_freq = train_freq - self.actor = None # type: Optional[th.nn.Module] - self.replay_buffer: Optional[ReplayBuffer] = None # Update policy keyword arguments if sde_support: self.policy_kwargs["use_sde"] = self.use_sde diff --git a/stable_baselines3/common/on_policy_algorithm.py b/stable_baselines3/common/on_policy_algorithm.py index 0e6f31616..87e192990 100644 --- a/stable_baselines3/common/on_policy_algorithm.py +++ b/stable_baselines3/common/on_policy_algorithm.py @@ -4,7 +4,7 @@ import numpy as np import torch as th -from gym import spaces +from gymnasium import spaces from stable_baselines3.common.base_class import BaseAlgorithm from stable_baselines3.common.buffers import DictRolloutBuffer, RolloutBuffer @@ -52,6 +52,9 @@ class OnPolicyAlgorithm(BaseAlgorithm): :param supported_action_spaces: The action spaces supported by the algorithm. """ + rollout_buffer: RolloutBuffer + policy: ActorCriticPolicy + def __init__( self, policy: Union[str, Type[ActorCriticPolicy]], @@ -73,7 +76,7 @@ def __init__( seed: Optional[int] = None, device: Union[th.device, str] = "auto", _init_setup_model: bool = True, - supported_action_spaces: Optional[Tuple[spaces.Space, ...]] = None, + supported_action_spaces: Optional[Tuple[Type[spaces.Space], ...]] = None, ): super().__init__( policy=policy, @@ -97,7 +100,6 @@ def __init__( self.ent_coef = ent_coef self.vf_coef = vf_coef self.max_grad_norm = max_grad_norm - self.rollout_buffer = None if _init_setup_model: self._setup_model() @@ -117,13 +119,11 @@ def _setup_model(self) -> None: gae_lambda=self.gae_lambda, n_envs=self.n_envs, ) - self.policy = self.policy_class( # pytype:disable=not-instantiable - self.observation_space, - self.action_space, - self.lr_schedule, - use_sde=self.use_sde, - **self.policy_kwargs # pytype:disable=not-instantiable + # pytype:disable=not-instantiable + self.policy = self.policy_class( # type: ignore[assignment] + self.observation_space, self.action_space, self.lr_schedule, use_sde=self.use_sde, **self.policy_kwargs ) + # pytype:enable=not-instantiable self.policy = self.policy.to(self.device) def collect_rollouts( @@ -201,16 +201,23 @@ def collect_rollouts( ): terminal_obs = self.policy.obs_to_tensor(infos[idx]["terminal_observation"])[0] with th.no_grad(): - terminal_value = self.policy.predict_values(terminal_obs)[0] + terminal_value = self.policy.predict_values(terminal_obs)[0] # type: ignore[arg-type] rewards[idx] += self.gamma * terminal_value - rollout_buffer.add(self._last_obs, actions, rewards, self._last_episode_starts, values, log_probs) - self._last_obs = new_obs + rollout_buffer.add( + self._last_obs, # type: ignore[arg-type] + actions, + rewards, + self._last_episode_starts, # type: ignore[arg-type] + values, + log_probs, + ) + self._last_obs = new_obs # type: ignore[assignment] self._last_episode_starts = dones with th.no_grad(): # Compute value for the last timestep - values = self.policy.predict_values(obs_as_tensor(new_obs, self.device)) + values = self.policy.predict_values(obs_as_tensor(new_obs, self.device)) # type: ignore[arg-type] rollout_buffer.compute_returns_and_advantage(last_values=values, dones=dones) @@ -246,6 +253,8 @@ def learn( callback.on_training_start(locals(), globals()) + assert self.env is not None + while self.num_timesteps < total_timesteps: continue_training = self.collect_rollouts(self.env, callback, self.rollout_buffer, n_rollout_steps=self.n_steps) @@ -257,6 +266,7 @@ def learn( # Display training infos if log_interval is not None and iteration % log_interval == 0: + assert self.ep_info_buffer is not None time_elapsed = max((time.time_ns() - self.start_time) / 1e9, sys.float_info.epsilon) fps = int((self.num_timesteps - self._num_timesteps_at_start) / time_elapsed) self.logger.record("time/iterations", iteration, exclude="tensorboard") diff --git a/stable_baselines3/common/policies.py b/stable_baselines3/common/policies.py index c67c45cf2..21d2034d6 100644 --- a/stable_baselines3/common/policies.py +++ b/stable_baselines3/common/policies.py @@ -9,7 +9,7 @@ import numpy as np import torch as th -from gym import spaces +from gymnasium import spaces from torch import nn from stable_baselines3.common.distributions import ( @@ -58,6 +58,8 @@ class BaseModel(nn.Module): excluding the learning rate, to pass to the optimizer """ + optimizer: th.optim.Optimizer + def __init__( self, observation_space: spaces.Space, @@ -84,7 +86,6 @@ def __init__( self.optimizer_class = optimizer_class self.optimizer_kwargs = optimizer_kwargs - self.optimizer: th.optim.Optimizer self.features_extractor_class = features_extractor_class self.features_extractor_kwargs = features_extractor_kwargs @@ -279,6 +280,8 @@ class BasePolicy(BaseModel, ABC): or not using a ``tanh()`` function. """ + features_extractor: BaseFeaturesExtractor + def __init__(self, *args, squash_output: bool = False, **kwargs): super().__init__(*args, **kwargs) self._squash_output = squash_output @@ -898,9 +901,9 @@ class ContinuousCritic(BaseModel): def __init__( self, observation_space: spaces.Space, - action_space: spaces.Space, + action_space: spaces.Box, net_arch: List[int], - features_extractor: nn.Module, + features_extractor: BaseFeaturesExtractor, features_dim: int, activation_fn: Type[nn.Module] = nn.ReLU, normalize_images: bool = True, diff --git a/stable_baselines3/common/preprocessing.py b/stable_baselines3/common/preprocessing.py index e280ed731..bc0959480 100644 --- a/stable_baselines3/common/preprocessing.py +++ b/stable_baselines3/common/preprocessing.py @@ -3,7 +3,7 @@ import numpy as np import torch as th -from gym import spaces +from gymnasium import spaces from torch.nn import functional as F @@ -158,10 +158,7 @@ def get_obs_shape( return (int(len(observation_space.nvec)),) elif isinstance(observation_space, spaces.MultiBinary): # Number of binary features - if type(observation_space.n) in [tuple, list, np.ndarray]: - return tuple(observation_space.n) - else: - return (int(observation_space.n),) + return observation_space.shape elif isinstance(observation_space, spaces.Dict): return {key: get_obs_shape(subspace) for (key, subspace) in observation_space.spaces.items()} # type: ignore[misc] @@ -205,18 +202,20 @@ def get_action_dim(action_space: spaces.Space) -> int: return int(len(action_space.nvec)) elif isinstance(action_space, spaces.MultiBinary): # Number of binary actions + assert isinstance( + action_space.n, int + ), "Multi-dimensional MultiBinary action space is not supported. You can flatten it instead." return int(action_space.n) else: raise NotImplementedError(f"{action_space} action space is not supported") -def check_for_nested_spaces(obs_space: spaces.Space): +def check_for_nested_spaces(obs_space: spaces.Space) -> None: """ Make sure the observation space does not have nested spaces (Dicts/Tuples inside Dicts/Tuples). If so, raise an Exception informing that there is no support for this. :param obs_space: an observation space - :return: """ if isinstance(obs_space, (spaces.Dict, spaces.Tuple)): sub_spaces = obs_space.spaces.values() if isinstance(obs_space, spaces.Dict) else obs_space.spaces diff --git a/stable_baselines3/common/save_util.py b/stable_baselines3/common/save_util.py index e5aeb662b..3c01a3f26 100644 --- a/stable_baselines3/common/save_util.py +++ b/stable_baselines3/common/save_util.py @@ -367,7 +367,7 @@ def load_from_zip_file( device: Union[th.device, str] = "auto", verbose: int = 0, print_system_info: bool = False, -) -> Tuple[Optional[Dict[str, Any]], Optional[TensorDict], Optional[TensorDict]]: +) -> Tuple[Optional[Dict[str, Any]], TensorDict, Optional[TensorDict]]: """ Load model data from a .zip archive diff --git a/stable_baselines3/common/torch_layers.py b/stable_baselines3/common/torch_layers.py index 44714d6fe..ad6c7eef1 100644 --- a/stable_baselines3/common/torch_layers.py +++ b/stable_baselines3/common/torch_layers.py @@ -1,8 +1,8 @@ from typing import Dict, List, Tuple, Type, Union -import gym +import gymnasium as gym import torch as th -from gym import spaces +from gymnasium import spaces from torch import nn from stable_baselines3.common.preprocessing import get_flattened_obs_dim, is_image_space @@ -63,10 +63,14 @@ class NatureCNN(BaseFeaturesExtractor): def __init__( self, - observation_space: spaces.Box, + observation_space: gym.Space, features_dim: int = 512, normalized_image: bool = False, ) -> None: + assert isinstance(observation_space, spaces.Box), ( + "NatureCNN must be used with a gym.spaces.Box ", + f"observation space, not {observation_space}", + ) super().__init__(observation_space, features_dim) # We assume CxHxW images (channels first) # Re-ordering will be done by pre-preprocessing or wrapper diff --git a/stable_baselines3/common/type_aliases.py b/stable_baselines3/common/type_aliases.py index 7227667a1..d38d7cf73 100644 --- a/stable_baselines3/common/type_aliases.py +++ b/stable_baselines3/common/type_aliases.py @@ -2,9 +2,9 @@ import sys from enum import Enum -from typing import Any, Callable, Dict, List, NamedTuple, Optional, Tuple, Union +from typing import Any, Callable, Dict, List, NamedTuple, Optional, SupportsFloat, Tuple, Union -import gym +import gymnasium as gym import numpy as np import torch as th @@ -17,8 +17,11 @@ GymEnv = Union[gym.Env, vec_env.VecEnv] GymObs = Union[Tuple, Dict[str, Any], np.ndarray, int] -GymStepReturn = Tuple[GymObs, float, bool, Dict] -TensorDict = Dict[Union[str, int], th.Tensor] +GymResetReturn = Tuple[GymObs, Dict] +AtariResetReturn = Tuple[np.ndarray, Dict[str, Any]] +GymStepReturn = Tuple[GymObs, float, bool, bool, Dict] +AtariStepReturn = Tuple[np.ndarray, SupportsFloat, bool, bool, Dict[str, Any]] +TensorDict = Dict[str, th.Tensor] OptimizerStateDict = Dict[str, Any] MaybeCallback = Union[None, Callable, List[callbacks.BaseCallback], callbacks.BaseCallback] diff --git a/stable_baselines3/common/utils.py b/stable_baselines3/common/utils.py index 1234fba79..d20cc8529 100644 --- a/stable_baselines3/common/utils.py +++ b/stable_baselines3/common/utils.py @@ -4,13 +4,14 @@ import random import re from collections import deque +from inspect import signature from itertools import zip_longest from typing import Dict, Iterable, List, Optional, Tuple, Union -import gym +import gymnasium as gym import numpy as np import torch as th -from gym import spaces +from gymnasium import spaces import stable_baselines3 as sb3 @@ -470,9 +471,7 @@ def polyak_update( th.add(target_param.data, param.data, alpha=tau, out=target_param.data) -def obs_as_tensor( - obs: Union[np.ndarray, Dict[Union[str, int], np.ndarray]], device: th.device -) -> Union[th.Tensor, TensorDict]: +def obs_as_tensor(obs: Union[np.ndarray, Dict[str, np.ndarray]], device: th.device) -> Union[th.Tensor, TensorDict]: """ Moves the observation to the given device. @@ -541,3 +540,18 @@ def get_system_info(print_info: bool = True) -> Tuple[Dict[str, str], str]: if print_info: print(env_info_str) return env_info, env_info_str + + +def compat_gym_seed(env: GymEnv, seed: int) -> None: + """ + Compatibility helper to seed Gym envs. + + :param env: The Gym environment. + :param seed: The seed for the pseudo random generator + """ + if "seed" in signature(env.unwrapped.reset).parameters: + # gym >= 0.23.1 + env.reset(seed=seed) + else: + # VecEnv and backward compatibility + env.seed(seed) diff --git a/stable_baselines3/common/vec_env/base_vec_env.py b/stable_baselines3/common/vec_env/base_vec_env.py index 708c021aa..a15750360 100644 --- a/stable_baselines3/common/vec_env/base_vec_env.py +++ b/stable_baselines3/common/vec_env/base_vec_env.py @@ -4,9 +4,9 @@ from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple, Type, Union import cloudpickle -import gym +import gymnasium as gym import numpy as np -from gym import spaces +from gymnasium import spaces # Define type aliases here to avoid circular import # Used when we want to access one or more VecEnv @@ -54,12 +54,20 @@ class VecEnv(ABC): :param action_space: Action space """ - metadata = {"render.modes": ["human", "rgb_array"]} + metadata = {"render_modes": ["human", "rgb_array"]} - def __init__(self, num_envs: int, observation_space: spaces.Space, action_space: spaces.Space): + def __init__( + self, + num_envs: int, + observation_space: spaces.Space, + action_space: spaces.Space, + render_mode: Optional[str] = None, + ): self.num_envs = num_envs self.observation_space = observation_space self.action_space = action_space + self.render_mode = render_mode + self.reset_infos = [{} for _ in range(num_envs)] # store info returned by the reset method @abstractmethod def reset(self) -> VecEnvObs: @@ -162,35 +170,72 @@ def step(self, actions: np.ndarray) -> VecEnvStepReturn: self.step_async(actions) return self.step_wait() - def get_images(self) -> Sequence[np.ndarray]: + def get_images(self) -> Sequence[Optional[np.ndarray]]: """ - Return RGB images from each environment + Return RGB images from each environment when available """ raise NotImplementedError - def render(self, mode: str = "human") -> Optional[np.ndarray]: + def render(self, mode: Optional[str] = None) -> Optional[np.ndarray]: """ Gym environment rendering :param mode: the rendering type """ - try: - imgs = self.get_images() - except NotImplementedError: - warnings.warn(f"Render not defined for {self}") + + if mode == "human" and self.render_mode != mode: + # Special case, if the render_mode="rgb_array" + # we can still display that image using opencv + if self.render_mode != "rgb_array": + warnings.warn( + f"You tried to render a VecEnv with mode='{mode}' " + "but the render mode defined when initializing the environment must be " + f"'human' or 'rgb_array', not '{self.render_mode}'." + ) + return + + elif mode and self.render_mode != mode: + warnings.warn( + f"""Starting from gymnasium v0.26, render modes are determined during the initialization of the environment. + We allow to pass a mode argument to maintain a backwards compatible VecEnv API, but the mode ({mode}) + has to be the same as the environment render mode ({self.render_mode}) which is not the case.""" + ) + return + + mode = mode or self.render_mode + + if mode is None: + warnings.warn("You tried to call render() but no `render_mode` was passed to the env constructor.") + return + + # mode == self.render_mode == "human" + # In that case, we try to call `self.env.render()` but it might + # crash for subprocesses + if self.render_mode == "human": + self.env_method("render") return - # Create a big image by tiling images from subprocesses - bigimg = tile_images(imgs) - if mode == "human": - import cv2 # pytype:disable=import-error + if mode == "rgb_array" or mode == "human": + # call the render method of the environments + images = self.get_images() + # Create a big image by tiling images from subprocesses + bigimg = tile_images(images) + + if mode == "human": + # Display it using OpenCV + import cv2 # pytype:disable=import-error + + cv2.imshow("vecenv", bigimg[:, :, ::-1]) + cv2.waitKey(1) + else: + return bigimg - cv2.imshow("vecenv", bigimg[:, :, ::-1]) - cv2.waitKey(1) - elif mode == "rgb_array": - return bigimg else: - raise NotImplementedError(f"Render mode {mode} is not supported by VecEnvs") + # Other render modes: + # In that case, we try to call `self.env.render()` but it might + # crash for subprocesses + # and we don't return the values + self.env_method("render") @abstractmethod def seed(self, seed: Optional[int] = None) -> List[Union[None, int]]: @@ -251,6 +296,7 @@ def __init__( venv: VecEnv, observation_space: Optional[spaces.Space] = None, action_space: Optional[spaces.Space] = None, + render_mode: Optional[str] = None, ): self.venv = venv VecEnv.__init__( @@ -258,6 +304,7 @@ def __init__( num_envs=venv.num_envs, observation_space=observation_space or venv.observation_space, action_space=action_space or venv.action_space, + render_mode=render_mode, ) self.class_attributes = dict(inspect.getmembers(self.__class__)) @@ -278,10 +325,10 @@ def seed(self, seed: Optional[int] = None) -> List[Union[None, int]]: def close(self) -> None: return self.venv.close() - def render(self, mode: str = "human") -> Optional[np.ndarray]: + def render(self, mode: Optional[str] = None) -> Optional[np.ndarray]: return self.venv.render(mode=mode) - def get_images(self) -> Sequence[np.ndarray]: + def get_images(self) -> Sequence[Optional[np.ndarray]]: return self.venv.get_images() def get_attr(self, attr_name: str, indices: VecEnvIndices = None) -> List[Any]: diff --git a/stable_baselines3/common/vec_env/dummy_vec_env.py b/stable_baselines3/common/vec_env/dummy_vec_env.py index 5b9fc8b40..7f092e040 100644 --- a/stable_baselines3/common/vec_env/dummy_vec_env.py +++ b/stable_baselines3/common/vec_env/dummy_vec_env.py @@ -1,18 +1,20 @@ +import warnings from collections import OrderedDict from copy import deepcopy from typing import Any, Callable, List, Optional, Sequence, Type, Union -import gym +import gymnasium as gym import numpy as np from stable_baselines3.common.vec_env.base_vec_env import VecEnv, VecEnvIndices, VecEnvObs, VecEnvStepReturn +from stable_baselines3.common.vec_env.patch_gym import _patch_env from stable_baselines3.common.vec_env.util import copy_obs_dict, dict_to_obs, obs_space_info class DummyVecEnv(VecEnv): """ Creates a simple vectorized wrapper for multiple environments, calling each environment in sequence on the current - Python process. This is useful for computationally simple environment such as ``cartpole-v1``, + Python process. This is useful for computationally simple environment such as ``Cartpole-v1``, as the overhead of multiprocess or multithread outweighs the environment computation time. This can also be used for RL methods that require a vectorized environment, but that you want a single environments to train with. @@ -23,7 +25,7 @@ class DummyVecEnv(VecEnv): """ def __init__(self, env_fns: List[Callable[[], gym.Env]]): - self.envs = [fn() for fn in env_fns] + self.envs = [_patch_env(fn()) for fn in env_fns] if len(set([id(env.unwrapped) for env in self.envs])) != len(self.envs): raise ValueError( "You tried to create multiple environments, but the function to create them returned the same instance " @@ -35,7 +37,7 @@ def __init__(self, env_fns: List[Callable[[], gym.Env]]): "Please read https://github.com/DLR-RM/stable-baselines3/issues/1151 for more information." ) env = self.envs[0] - VecEnv.__init__(self, len(env_fns), env.observation_space, env.action_space) + VecEnv.__init__(self, len(env_fns), env.observation_space, env.action_space, env.render_mode) obs_space = env.observation_space self.keys, shapes, dtypes = obs_space_info(obs_space) @@ -50,28 +52,38 @@ def step_async(self, actions: np.ndarray) -> None: self.actions = actions def step_wait(self) -> VecEnvStepReturn: + # Avoid circular imports for env_idx in range(self.num_envs): - obs, self.buf_rews[env_idx], self.buf_dones[env_idx], self.buf_infos[env_idx] = self.envs[env_idx].step( + obs, self.buf_rews[env_idx], terminated, truncated, self.buf_infos[env_idx] = self.envs[env_idx].step( self.actions[env_idx] ) + # convert to SB3 VecEnv api + self.buf_dones[env_idx] = terminated or truncated + # See https://github.com/openai/gym/issues/3102 + # Gym 0.26 introduces a breaking change + self.buf_infos[env_idx]["TimeLimit.truncated"] = truncated and not terminated + if self.buf_dones[env_idx]: # save final observation where user can get it, then reset self.buf_infos[env_idx]["terminal_observation"] = obs - obs = self.envs[env_idx].reset() + obs, self.reset_infos[env_idx] = self.envs[env_idx].reset() self._save_obs(env_idx, obs) return (self._obs_from_buf(), np.copy(self.buf_rews), np.copy(self.buf_dones), deepcopy(self.buf_infos)) def seed(self, seed: Optional[int] = None) -> List[Union[None, int]]: + # Avoid circular import + from stable_baselines3.common.utils import compat_gym_seed + if seed is None: seed = np.random.randint(0, 2**32 - 1) seeds = [] for idx, env in enumerate(self.envs): - seeds.append(env.seed(seed + idx)) + seeds.append(compat_gym_seed(env, seed=seed + idx)) return seeds def reset(self) -> VecEnvObs: for env_idx in range(self.num_envs): - obs = self.envs[env_idx].reset() + obs, self.reset_infos[env_idx] = self.envs[env_idx].reset() self._save_obs(env_idx, obs) return self._obs_from_buf() @@ -79,25 +91,22 @@ def close(self) -> None: for env in self.envs: env.close() - def get_images(self) -> Sequence[np.ndarray]: - return [env.render(mode="rgb_array") for env in self.envs] + def get_images(self) -> Sequence[Optional[np.ndarray]]: + if self.render_mode != "rgb_array": + warnings.warn( + f"The render mode is {self.render_mode}, but this method assumes it is `rgb_array` to obtain images." + ) + return [None for _ in self.envs] + return [env.render() for env in self.envs] - def render(self, mode: str = "human") -> Optional[np.ndarray]: + def render(self, mode: Optional[str] = None) -> Optional[np.ndarray]: """ Gym environment rendering. If there are multiple environments then they are tiled together in one image via ``BaseVecEnv.render()``. - Otherwise (if ``self.num_envs == 1``), we pass the render call directly to the - underlying environment. - - Therefore, some arguments such as ``mode`` will have values that are valid - only when ``num_envs == 1``. :param mode: The rendering type. """ - if self.num_envs == 1: - return self.envs[0].render(mode=mode) - else: - return super().render(mode=mode) + return super().render(mode=mode) def _save_obs(self, env_idx: int, obs: VecEnvObs) -> None: for key in self.keys: diff --git a/stable_baselines3/common/vec_env/patch_gym.py b/stable_baselines3/common/vec_env/patch_gym.py new file mode 100644 index 000000000..b86c52236 --- /dev/null +++ b/stable_baselines3/common/vec_env/patch_gym.py @@ -0,0 +1,100 @@ +import warnings +from inspect import signature +from typing import Union + +import gymnasium + +try: + import gym # pytype: disable=import-error + + gym_installed = True +except ImportError: + gym_installed = False + + +def _patch_env(env: Union["gym.Env", gymnasium.Env]) -> gymnasium.Env: # pragma: no cover + """ + Adapted from https://github.com/thu-ml/tianshou. + + Takes an environment and patches it to return Gymnasium env. + This function takes the environment object and returns a patched + env, using shimmy wrapper to convert it to Gymnasium, + if necessary. + + :param env: A gym/gymnasium env + :return: Patched env (gymnasium env) + """ + + # Gymnasium env, no patching to be done + if isinstance(env, gymnasium.Env): + return env + + if not gym_installed or not isinstance(env, gym.Env): + raise ValueError( + f"The environment is of type {type(env)}, not a Gymnasium " + f"environment. In this case, we expect OpenAI Gym to be " + f"installed and the environment to be an OpenAI Gym environment." + ) + + try: + import shimmy # pytype: disable=import-error + except ImportError as e: + raise ImportError( + "Missing shimmy installation. You an OpenAI Gym environment. " + "Stable-Baselines3 (SB3) has transitioned to using Gymnasium internally. " + "In order to use OpenAI Gym environments with SB3, you need to " + "install shimmy (`pip install 'shimmy>=0.2.1'`)." + ) from e + + warnings.warn( + "You provided an OpenAI Gym environment. " + "We strongly recommend transitioning to Gymnasium environments. " + "Stable-Baselines3 is automatically wrapping your environments in a compatibility " + "layer, which could potentially cause issues." + ) + + if "seed" in signature(env.unwrapped.reset).parameters: + # Gym 0.26+ env + return shimmy.GymV26CompatibilityV0(env=env) + # Gym 0.21 env + return shimmy.GymV21CompatibilityV0(env=env) + + +def _convert_space(space: Union["gym.Space", gymnasium.Space]) -> gymnasium.Space: # pragma: no cover + """ + Takes a space and patches it to return Gymnasium Space. + This function takes the space object and returns a patched + space, using shimmy wrapper to convert it to Gymnasium, + if necessary. + + :param env: A gym/gymnasium Space + :return: Patched space (gymnasium Space) + """ + + # Gymnasium space, no convertion to be done + if isinstance(space, gymnasium.Space): + return space + + if not gym_installed or not isinstance(space, gym.Space): + raise ValueError( + f"The space is of type {type(space)}, not a Gymnasium " + f"space. In this case, we expect OpenAI Gym to be " + f"installed and the space to be an OpenAI Gym space." + ) + + try: + import shimmy # pytype: disable=import-error + except ImportError as e: + raise ImportError( + "Missing shimmy installation. You provided an OpenAI Gym space. " + "Stable-Baselines3 (SB3) has transitioned to using Gymnasium internally. " + "In order to use OpenAI Gym space with SB3, you need to " + "install shimmy (`pip install 'shimmy>=0.2.1'`)." + ) from e + + warnings.warn( + "You loaded a model that was trained using OpenAI Gym. " + "We strongly recommend transitioning to Gymnasium by saving that model again." + ) + + return shimmy.openai_gym_compatibility._convert_space(space) diff --git a/stable_baselines3/common/vec_env/stacked_observations.py b/stable_baselines3/common/vec_env/stacked_observations.py index e674ee08a..bf375e165 100644 --- a/stable_baselines3/common/vec_env/stacked_observations.py +++ b/stable_baselines3/common/vec_env/stacked_observations.py @@ -2,7 +2,7 @@ from typing import Any, Dict, Generic, List, Mapping, Optional, Tuple, TypeVar, Union import numpy as np -from gym import spaces +from gymnasium import spaces from stable_baselines3.common.preprocessing import is_image_space, is_image_space_channels_first @@ -31,7 +31,7 @@ def __init__( self, num_envs: int, n_stack: int, - observation_space: Union[spaces.Box, spaces.Dict], # Replace by Space[TObs] in gym>=0.26 + observation_space: Union[spaces.Box, spaces.Dict], channels_order: Optional[Union[str, Mapping[str, Optional[str]]]] = None, ) -> None: self.n_stack = n_stack @@ -40,12 +40,12 @@ def __init__( if not isinstance(channels_order, Mapping): channels_order = {key: channels_order for key in observation_space.spaces.keys()} self.sub_stacked_observations = { - key: StackedObservations(num_envs, n_stack, subspace, channels_order[key]) + key: StackedObservations(num_envs, n_stack, subspace, channels_order[key]) # type: ignore[arg-type] for key, subspace in observation_space.spaces.items() } self.stacked_observation_space = spaces.Dict( {key: substack_obs.stacked_observation_space for key, substack_obs in self.sub_stacked_observations.items()} - ) # type: spaces.Dict # make mypy happy + ) # type: Union[spaces.Dict, spaces.Box] # make mypy happy elif isinstance(observation_space, spaces.Box): if isinstance(channels_order, Mapping): raise TypeError("When the observation space is Box, channels_order can't be a dict.") @@ -55,7 +55,11 @@ def __init__( ) low = np.repeat(observation_space.low, n_stack, axis=self.repeat_axis) high = np.repeat(observation_space.high, n_stack, axis=self.repeat_axis) - self.stacked_observation_space = spaces.Box(low=low, high=high, dtype=observation_space.dtype) + self.stacked_observation_space = spaces.Box( + low=low, + high=high, + dtype=observation_space.dtype, # type: ignore[arg-type] + ) self.stacked_obs = np.zeros((num_envs, *self.stacked_shape), dtype=observation_space.dtype) else: raise TypeError( @@ -97,36 +101,6 @@ def compute_stacking( stacked_shape[repeat_axis] *= n_stack return channels_first, stack_dimension, tuple(stacked_shape), repeat_axis - def stack_observation_space(self, observation_space: Union[spaces.Box, spaces.Dict]) -> Union[spaces.Box, spaces.Dict]: - """ - This function is deprecated. - - As an alternative, use - - .. code-block:: python - - low = np.repeat(observation_space.low, stacked_observation.n_stack, axis=stacked_observation.repeat_axis) - high = np.repeat(observation_space.high, stacked_observation.n_stack, axis=stacked_observation.repeat_axis) - stacked_observation_space = spaces.Box(low=low, high=high, dtype=observation_space.dtype) - - :return: New observation space with stacked dimensions - """ - warnings.warn( - "stack_observation_space is deprecated and will be removed in the next SB3 release. " - "Please refer to the docstring for a workaround.", - DeprecationWarning, - ) - if isinstance(observation_space, spaces.Dict): - return spaces.Dict( - { - key: sub_stacked_observation.stack_observation_space(sub_stacked_observation.observation_space) - for key, sub_stacked_observation in self.sub_stacked_observations.items() - } - ) - low = np.repeat(observation_space.low, self.n_stack, axis=self.repeat_axis) - high = np.repeat(observation_space.high, self.n_stack, axis=self.repeat_axis) - return spaces.Box(low=low, high=high, dtype=observation_space.dtype) - def reset(self, observation: TObs) -> TObs: """ Reset the stacked_obs, add the reset observation to the stack, and return the stack. diff --git a/stable_baselines3/common/vec_env/subproc_vec_env.py b/stable_baselines3/common/vec_env/subproc_vec_env.py index 7ff579d30..ccefd2078 100644 --- a/stable_baselines3/common/vec_env/subproc_vec_env.py +++ b/stable_baselines3/common/vec_env/subproc_vec_env.py @@ -1,10 +1,11 @@ import multiprocessing as mp +import warnings from collections import OrderedDict from typing import Any, Callable, List, Optional, Sequence, Tuple, Type, Union -import gym +import gymnasium as gym import numpy as np -from gym import spaces +from gymnasium import spaces from stable_baselines3.common.vec_env.base_vec_env import ( CloudpickleWrapper, @@ -13,33 +14,41 @@ VecEnvObs, VecEnvStepReturn, ) +from stable_baselines3.common.vec_env.patch_gym import _patch_env def _worker( - remote: mp.connection.Connection, parent_remote: mp.connection.Connection, env_fn_wrapper: CloudpickleWrapper + remote: mp.connection.Connection, + parent_remote: mp.connection.Connection, + env_fn_wrapper: CloudpickleWrapper, ) -> None: # Import here to avoid a circular import from stable_baselines3.common.env_util import is_wrapped + from stable_baselines3.common.utils import compat_gym_seed parent_remote.close() - env = env_fn_wrapper.var() + env = _patch_env(env_fn_wrapper.var()) + reset_info = {} while True: try: cmd, data = remote.recv() if cmd == "step": - observation, reward, done, info = env.step(data) + observation, reward, terminated, truncated, info = env.step(data) + # convert to SB3 VecEnv api + done = terminated or truncated + info["TimeLimit.truncated"] = truncated and not terminated if done: # save final observation where user can get it, then reset info["terminal_observation"] = observation - observation = env.reset() - remote.send((observation, reward, done, info)) + observation, reset_info = env.reset() + remote.send((observation, reward, done, info, reset_info)) elif cmd == "seed": - remote.send(env.seed(data)) + remote.send(compat_gym_seed(env, seed=data)) elif cmd == "reset": - observation = env.reset() - remote.send(observation) + observation, reset_info = env.reset() + remote.send((observation, reset_info)) elif cmd == "render": - remote.send(env.render(data)) + remote.send(env.render()) elif cmd == "close": env.close() remote.close() @@ -110,7 +119,10 @@ def __init__(self, env_fns: List[Callable[[], gym.Env]], start_method: Optional[ self.remotes[0].send(("get_spaces", None)) observation_space, action_space = self.remotes[0].recv() - VecEnv.__init__(self, len(env_fns), observation_space, action_space) + + self.remotes[0].send(("get_attr", "render_mode")) + render_mode = self.remotes[0].recv() + VecEnv.__init__(self, len(env_fns), observation_space, action_space, render_mode) def step_async(self, actions: np.ndarray) -> None: for remote, action in zip(self.remotes, actions): @@ -120,7 +132,7 @@ def step_async(self, actions: np.ndarray) -> None: def step_wait(self) -> VecEnvStepReturn: results = [remote.recv() for remote in self.remotes] self.waiting = False - obs, rews, dones, infos = zip(*results) + obs, rews, dones, infos, self.reset_infos = zip(*results) return _flatten_obs(obs, self.observation_space), np.stack(rews), np.stack(dones), infos def seed(self, seed: Optional[int] = None) -> List[Union[None, int]]: @@ -133,7 +145,8 @@ def seed(self, seed: Optional[int] = None) -> List[Union[None, int]]: def reset(self) -> VecEnvObs: for remote in self.remotes: remote.send(("reset", None)) - obs = [remote.recv() for remote in self.remotes] + results = [remote.recv() for remote in self.remotes] + obs, self.reset_infos = zip(*results) return _flatten_obs(obs, self.observation_space) def close(self) -> None: @@ -148,13 +161,17 @@ def close(self) -> None: process.join() self.closed = True - def get_images(self) -> Sequence[np.ndarray]: + def get_images(self) -> Sequence[Optional[np.ndarray]]: + if self.render_mode != "rgb_array": + warnings.warn( + f"The render mode is {self.render_mode}, but this method assumes it is `rgb_array` to obtain images." + ) + return [None for _ in self.remotes] for pipe in self.remotes: - # gather images from subprocesses - # `mode` will be taken into account later - pipe.send(("render", "rgb_array")) - imgs = [pipe.recv() for pipe in self.remotes] - return imgs + # gather render return from subprocesses + pipe.send(("render", None)) + outputs = [pipe.recv() for pipe in self.remotes] + return outputs def get_attr(self, attr_name: str, indices: VecEnvIndices = None) -> List[Any]: """Return attribute from vectorized environment (see base class).""" diff --git a/stable_baselines3/common/vec_env/util.py b/stable_baselines3/common/vec_env/util.py index 7d318acff..6d55db817 100644 --- a/stable_baselines3/common/vec_env/util.py +++ b/stable_baselines3/common/vec_env/util.py @@ -5,7 +5,7 @@ from typing import Any, Dict, List, Tuple import numpy as np -from gym import spaces +from gymnasium import spaces from stable_baselines3.common.preprocessing import check_for_nested_spaces from stable_baselines3.common.vec_env.base_vec_env import VecEnvObs diff --git a/stable_baselines3/common/vec_env/vec_check_nan.py b/stable_baselines3/common/vec_env/vec_check_nan.py index 98ad217f6..170f36ec8 100644 --- a/stable_baselines3/common/vec_env/vec_check_nan.py +++ b/stable_baselines3/common/vec_env/vec_check_nan.py @@ -2,7 +2,7 @@ from typing import List, Tuple import numpy as np -from gym import spaces +from gymnasium import spaces from stable_baselines3.common.vec_env.base_vec_env import VecEnv, VecEnvObs, VecEnvStepReturn, VecEnvWrapper diff --git a/stable_baselines3/common/vec_env/vec_frame_stack.py b/stable_baselines3/common/vec_env/vec_frame_stack.py index 8a020ddd6..200201f06 100644 --- a/stable_baselines3/common/vec_env/vec_frame_stack.py +++ b/stable_baselines3/common/vec_env/vec_frame_stack.py @@ -1,7 +1,7 @@ from typing import Any, Dict, List, Mapping, Optional, Tuple, Union import numpy as np -from gym import spaces +from gymnasium import spaces from stable_baselines3.common.vec_env.base_vec_env import VecEnv, VecEnvWrapper from stable_baselines3.common.vec_env.stacked_observations import StackedObservations @@ -35,6 +35,9 @@ def step_wait( return observations, rewards, dones, infos def reset(self) -> Union[np.ndarray, Dict[str, np.ndarray]]: + """ + Reset all environments + """ observation = self.venv.reset() # pytype:disable=annotation-type-mismatch observation = self.stacked_obs.reset(observation) return observation diff --git a/stable_baselines3/common/vec_env/vec_normalize.py b/stable_baselines3/common/vec_env/vec_normalize.py index ffa813fa0..ebefa82d1 100644 --- a/stable_baselines3/common/vec_env/vec_normalize.py +++ b/stable_baselines3/common/vec_env/vec_normalize.py @@ -4,7 +4,7 @@ from typing import Any, Dict, List, Optional, Union import numpy as np -from gym import spaces +from gymnasium import spaces from stable_baselines3.common import utils from stable_baselines3.common.preprocessing import is_image_space diff --git a/stable_baselines3/common/vec_env/vec_transpose.py b/stable_baselines3/common/vec_env/vec_transpose.py index b6b0ad832..beb603961 100644 --- a/stable_baselines3/common/vec_env/vec_transpose.py +++ b/stable_baselines3/common/vec_env/vec_transpose.py @@ -2,7 +2,7 @@ from typing import Dict, Union import numpy as np -from gym import spaces +from gymnasium import spaces from stable_baselines3.common.preprocessing import is_image_space, is_image_space_channels_first from stable_baselines3.common.vec_env.base_vec_env import VecEnv, VecEnvStepReturn, VecEnvWrapper diff --git a/stable_baselines3/common/vec_env/vec_video_recorder.py b/stable_baselines3/common/vec_env/vec_video_recorder.py index 83d058abc..6f670054b 100644 --- a/stable_baselines3/common/vec_env/vec_video_recorder.py +++ b/stable_baselines3/common/vec_env/vec_video_recorder.py @@ -1,7 +1,7 @@ import os from typing import Callable -from gym.wrappers.monitoring import video_recorder +from gymnasium.wrappers.monitoring import video_recorder from stable_baselines3.common.vec_env.base_vec_env import VecEnv, VecEnvObs, VecEnvStepReturn, VecEnvWrapper from stable_baselines3.common.vec_env.dummy_vec_env import DummyVecEnv @@ -47,6 +47,7 @@ def __init__( metadata = temp_env.metadata self.env.metadata = metadata + assert self.env.render_mode == "rgb_array", f"The render_mode must be 'rgb_array', not {self.env.render_mode}" self.record_video_trigger = record_video_trigger self.video_recorder = None @@ -109,4 +110,4 @@ def close(self) -> None: self.close_video_recorder() def __del__(self): - self.close() + self.close_video_recorder() diff --git a/stable_baselines3/dqn/dqn.py b/stable_baselines3/dqn/dqn.py index 59909105f..b85a30f80 100644 --- a/stable_baselines3/dqn/dqn.py +++ b/stable_baselines3/dqn/dqn.py @@ -3,7 +3,7 @@ import numpy as np import torch as th -from gym import spaces +from gymnasium import spaces from torch.nn import functional as F from stable_baselines3.common.buffers import ReplayBuffer @@ -11,7 +11,7 @@ from stable_baselines3.common.policies import BasePolicy from stable_baselines3.common.type_aliases import GymEnv, MaybeCallback, Schedule from stable_baselines3.common.utils import get_linear_fn, get_parameters_by_name, polyak_update -from stable_baselines3.dqn.policies import CnnPolicy, DQNPolicy, MlpPolicy, MultiInputPolicy +from stable_baselines3.dqn.policies import CnnPolicy, DQNPolicy, MlpPolicy, MultiInputPolicy, QNetwork SelfDQN = TypeVar("SelfDQN", bound="DQN") @@ -67,6 +67,11 @@ class DQN(OffPolicyAlgorithm): "CnnPolicy": CnnPolicy, "MultiInputPolicy": MultiInputPolicy, } + # Linear schedule will be defined in `_setup_model()` + exploration_schedule: Schedule + q_net: QNetwork + q_net_target: QNetwork + policy: DQNPolicy def __init__( self, @@ -131,10 +136,6 @@ def __init__( self.max_grad_norm = max_grad_norm # "epsilon" for the epsilon-greedy exploration self.exploration_rate = 0.0 - # Linear schedule will be defined in `_setup_model()` - self.exploration_schedule: Schedule - self.q_net: th.nn.Module - self.q_net_target: th.nn.Module if _init_setup_model: self._setup_model() @@ -164,8 +165,6 @@ def _setup_model(self) -> None: self.target_update_interval = max(self.target_update_interval // self.n_envs, 1) def _create_aliases(self) -> None: - # For type checker: - assert isinstance(self.policy, DQNPolicy) self.q_net = self.policy.q_net self.q_net_target = self.policy.q_net_target diff --git a/stable_baselines3/dqn/policies.py b/stable_baselines3/dqn/policies.py index 6991b8a80..fcdb95890 100644 --- a/stable_baselines3/dqn/policies.py +++ b/stable_baselines3/dqn/policies.py @@ -1,7 +1,7 @@ from typing import Any, Dict, List, Optional, Type import torch as th -from gym import spaces +from gymnasium import spaces from torch import nn from stable_baselines3.common.policies import BasePolicy @@ -27,10 +27,12 @@ class QNetwork(BasePolicy): dividing by 255.0 (True by default) """ + action_space: spaces.Discrete + def __init__( self, observation_space: spaces.Space, - action_space: spaces.Space, + action_space: spaces.Discrete, features_extractor: BaseFeaturesExtractor, features_dim: int, net_arch: Optional[List[int]] = None, @@ -50,7 +52,7 @@ def __init__( self.net_arch = net_arch self.activation_fn = activation_fn self.features_dim = features_dim - action_dim = self.action_space.n # number of actions + action_dim = int(self.action_space.n) # number of actions q_net = create_mlp(self.features_dim, action_dim, self.net_arch, self.activation_fn) self.q_net = nn.Sequential(*q_net) @@ -61,8 +63,6 @@ def forward(self, obs: th.Tensor) -> th.Tensor: :param obs: Observation :return: The estimated Q-Value for each action. """ - # For type checker: - assert isinstance(self.features_extractor, BaseFeaturesExtractor) return self.q_net(self.extract_features(obs, self.features_extractor)) def _predict(self, observation: th.Tensor, deterministic: bool = True) -> th.Tensor: @@ -105,10 +105,13 @@ class DQNPolicy(BasePolicy): excluding the learning rate, to pass to the optimizer """ + q_net: QNetwork + q_net_target: QNetwork + def __init__( self, observation_space: spaces.Space, - action_space: spaces.Space, + action_space: spaces.Discrete, lr_schedule: Schedule, net_arch: Optional[List[int]] = None, activation_fn: Type[nn.Module] = nn.ReLU, @@ -145,8 +148,6 @@ def __init__( "normalize_images": normalize_images, } - self.q_net: QNetwork - self.q_net_target: QNetwork self._build(lr_schedule) def _build(self, lr_schedule: Schedule) -> None: @@ -234,7 +235,7 @@ class CnnPolicy(DQNPolicy): def __init__( self, observation_space: spaces.Space, - action_space: spaces.Space, + action_space: spaces.Discrete, lr_schedule: Schedule, net_arch: Optional[List[int]] = None, activation_fn: Type[nn.Module] = nn.ReLU, @@ -279,7 +280,7 @@ class MultiInputPolicy(DQNPolicy): def __init__( self, observation_space: spaces.Dict, - action_space: spaces.Space, + action_space: spaces.Discrete, lr_schedule: Schedule, net_arch: Optional[List[int]] = None, activation_fn: Type[nn.Module] = nn.ReLU, diff --git a/stable_baselines3/her/her_replay_buffer.py b/stable_baselines3/her/her_replay_buffer.py index 5a438b411..9e06d6444 100644 --- a/stable_baselines3/her/her_replay_buffer.py +++ b/stable_baselines3/her/her_replay_buffer.py @@ -4,7 +4,7 @@ import numpy as np import torch as th -from gym import spaces +from gymnasium import spaces from stable_baselines3.common.buffers import DictReplayBuffer from stable_baselines3.common.type_aliases import DictReplayBufferSamples, TensorDict @@ -58,7 +58,6 @@ def __init__( n_sampled_goal: int = 4, goal_selection_strategy: Union[GoalSelectionStrategy, str] = "future", copy_info_dict: bool = False, - online_sampling: Optional[bool] = None, ): super().__init__( buffer_size, @@ -72,14 +71,6 @@ def __init__( self.env = env self.copy_info_dict = copy_info_dict - if online_sampling is not None: - assert online_sampling is True, "Since v1.8.0, SB3 only supports online sampling with HerReplayBuffer." - warnings.warn( - "Since v1.8.0, the `online_sampling` argument is deprecated " - "as SB3 only supports online sampling with HerReplayBuffer. It will be removed in v2.0", - stacklevel=1, - ) - # convert goal_selection_strategy into GoalSelectionStrategy if string if isinstance(goal_selection_strategy, str): self.goal_selection_strategy = KEY_TO_GOAL_STRATEGY[goal_selection_strategy.lower()] diff --git a/stable_baselines3/ppo/ppo.py b/stable_baselines3/ppo/ppo.py index e01c54ab8..0df51dc61 100644 --- a/stable_baselines3/ppo/ppo.py +++ b/stable_baselines3/ppo/ppo.py @@ -3,7 +3,7 @@ import numpy as np import torch as th -from gym import spaces +from gymnasium import spaces from torch.nn import functional as F from stable_baselines3.common.on_policy_algorithm import OnPolicyAlgorithm @@ -183,10 +183,10 @@ def train(self) -> None: # Update optimizer learning rate self._update_learning_rate(self.policy.optimizer) # Compute current clip range - clip_range = self.clip_range(self._current_progress_remaining) + clip_range = self.clip_range(self._current_progress_remaining) # type: ignore[operator] # Optional: clip range for the value function if self.clip_range_vf is not None: - clip_range_vf = self.clip_range_vf(self._current_progress_remaining) + clip_range_vf = self.clip_range_vf(self._current_progress_remaining) # type: ignore[operator] entropy_losses = [] pg_losses, value_losses = [], [] diff --git a/stable_baselines3/sac/policies.py b/stable_baselines3/sac/policies.py index e756097b1..8902629d4 100644 --- a/stable_baselines3/sac/policies.py +++ b/stable_baselines3/sac/policies.py @@ -1,7 +1,7 @@ from typing import Any, Dict, List, Optional, Tuple, Type, Union import torch as th -from gym import spaces +from gymnasium import spaces from torch import nn from stable_baselines3.common.distributions import SquashedDiagGaussianDistribution, StateDependentNoiseDistribution @@ -45,10 +45,12 @@ class Actor(BasePolicy): dividing by 255.0 (True by default) """ + action_space: spaces.Box + def __init__( self, observation_space: spaces.Space, - action_space: spaces.Space, + action_space: spaces.Box, net_arch: List[int], features_extractor: nn.Module, features_dim: int, @@ -96,9 +98,9 @@ def __init__( if clip_mean > 0.0: self.mu = nn.Sequential(self.mu, nn.Hardtanh(min_val=-clip_mean, max_val=clip_mean)) else: - self.action_dist = SquashedDiagGaussianDistribution(action_dim) + self.action_dist = SquashedDiagGaussianDistribution(action_dim) # type: ignore[assignment] self.mu = nn.Linear(last_layer_dim, action_dim) - self.log_std = nn.Linear(last_layer_dim, action_dim) + self.log_std = nn.Linear(last_layer_dim, action_dim) # type: ignore[assignment] def _get_constructor_parameters(self) -> Dict[str, Any]: data = super()._get_constructor_parameters() @@ -157,7 +159,7 @@ def get_action_dist_params(self, obs: th.Tensor) -> Tuple[th.Tensor, th.Tensor, if self.use_sde: return mean_actions, self.log_std, dict(latent_sde=latent_pi) # Unstructured exploration (Original implementation) - log_std = self.log_std(latent_pi) + log_std = self.log_std(latent_pi) # type: ignore[operator] # Original Implementation to cap the standard deviation log_std = th.clamp(log_std, LOG_STD_MIN, LOG_STD_MAX) return mean_actions, log_std, {} @@ -205,10 +207,14 @@ class SACPolicy(BasePolicy): between the actor and the critic (this saves computation time) """ + actor: Actor + critic: ContinuousCritic + critic_target: ContinuousCritic + def __init__( self, observation_space: spaces.Space, - action_space: spaces.Space, + action_space: spaces.Box, lr_schedule: Schedule, net_arch: Optional[Union[List[int], Dict[str, List[int]]]] = None, activation_fn: Type[nn.Module] = nn.ReLU, @@ -267,15 +273,17 @@ def __init__( } ) - self.actor, self.actor_target = None, None - self.critic, self.critic_target = None, None self.share_features_extractor = share_features_extractor self._build(lr_schedule) def _build(self, lr_schedule: Schedule) -> None: self.actor = self.make_actor() - self.actor.optimizer = self.optimizer_class(self.actor.parameters(), lr=lr_schedule(1), **self.optimizer_kwargs) + self.actor.optimizer = self.optimizer_class( + self.actor.parameters(), + lr=lr_schedule(1), # type: ignore[call-arg] + **self.optimizer_kwargs, + ) if self.share_features_extractor: self.critic = self.make_critic(features_extractor=self.actor.features_extractor) @@ -286,13 +294,17 @@ def _build(self, lr_schedule: Schedule) -> None: # Create a separate features extractor for the critic # this requires more memory and computation self.critic = self.make_critic(features_extractor=None) - critic_parameters = self.critic.parameters() + critic_parameters = list(self.critic.parameters()) # Critic target should not share the features extractor with critic self.critic_target = self.make_critic(features_extractor=None) self.critic_target.load_state_dict(self.critic.state_dict()) - self.critic.optimizer = self.optimizer_class(critic_parameters, lr=lr_schedule(1), **self.optimizer_kwargs) + self.critic.optimizer = self.optimizer_class( + critic_parameters, + lr=lr_schedule(1), # type: ignore[call-arg] + **self.optimizer_kwargs, + ) # Target networks should always be in eval mode self.critic_target.set_training_mode(False) @@ -386,7 +398,7 @@ class CnnPolicy(SACPolicy): def __init__( self, observation_space: spaces.Space, - action_space: spaces.Space, + action_space: spaces.Box, lr_schedule: Schedule, net_arch: Optional[Union[List[int], Dict[str, List[int]]]] = None, activation_fn: Type[nn.Module] = nn.ReLU, @@ -452,7 +464,7 @@ class MultiInputPolicy(SACPolicy): def __init__( self, observation_space: spaces.Space, - action_space: spaces.Space, + action_space: spaces.Box, lr_schedule: Schedule, net_arch: Optional[Union[List[int], Dict[str, List[int]]]] = None, activation_fn: Type[nn.Module] = nn.ReLU, diff --git a/stable_baselines3/sac/sac.py b/stable_baselines3/sac/sac.py index 9b989850a..de344a453 100644 --- a/stable_baselines3/sac/sac.py +++ b/stable_baselines3/sac/sac.py @@ -2,16 +2,16 @@ import numpy as np import torch as th -from gym import spaces +from gymnasium import spaces from torch.nn import functional as F from stable_baselines3.common.buffers import ReplayBuffer from stable_baselines3.common.noise import ActionNoise from stable_baselines3.common.off_policy_algorithm import OffPolicyAlgorithm -from stable_baselines3.common.policies import BasePolicy +from stable_baselines3.common.policies import BasePolicy, ContinuousCritic from stable_baselines3.common.type_aliases import GymEnv, MaybeCallback, Schedule from stable_baselines3.common.utils import get_parameters_by_name, polyak_update -from stable_baselines3.sac.policies import CnnPolicy, MlpPolicy, MultiInputPolicy, SACPolicy +from stable_baselines3.sac.policies import Actor, CnnPolicy, MlpPolicy, MultiInputPolicy, SACPolicy SelfSAC = TypeVar("SelfSAC", bound="SAC") @@ -82,6 +82,10 @@ class SAC(OffPolicyAlgorithm): "CnnPolicy": CnnPolicy, "MultiInputPolicy": MultiInputPolicy, } + policy: SACPolicy + actor: Actor + critic: ContinuousCritic + critic_target: ContinuousCritic def __init__( self, @@ -137,7 +141,7 @@ def __init__( sde_sample_freq=sde_sample_freq, use_sde_at_warmup=use_sde_at_warmup, optimize_memory_usage=optimize_memory_usage, - supported_action_spaces=(spaces.Box), + supported_action_spaces=(spaces.Box,), support_multi_env=True, ) @@ -147,7 +151,7 @@ def __init__( # Inverse of the reward scale self.ent_coef = ent_coef self.target_update_interval = target_update_interval - self.ent_coef_optimizer = None + self.ent_coef_optimizer: Optional[th.optim.Adam] = None if _init_setup_model: self._setup_model() @@ -161,7 +165,7 @@ def _setup_model(self) -> None: # Target entropy is used when learning the entropy coefficient if self.target_entropy == "auto": # automatically set target entropy if needed - self.target_entropy = -np.prod(self.env.action_space.shape).astype(np.float32) + self.target_entropy = float(-np.prod(self.env.action_space.shape).astype(np.float32)) # type: ignore else: # Force conversion # this will also throw an error for unexpected string @@ -208,7 +212,7 @@ def train(self, gradient_steps: int, batch_size: int = 64) -> None: for gradient_step in range(gradient_steps): # Sample replay buffer - replay_data = self.replay_buffer.sample(batch_size, env=self._vec_normalize_env) + replay_data = self.replay_buffer.sample(batch_size, env=self._vec_normalize_env) # type: ignore[union-attr] # We need to sample because `log_std` may have changed between two gradient steps if self.use_sde: @@ -219,7 +223,7 @@ def train(self, gradient_steps: int, batch_size: int = 64) -> None: log_prob = log_prob.reshape(-1, 1) ent_coef_loss = None - if self.ent_coef_optimizer is not None: + if self.ent_coef_optimizer is not None and self.log_ent_coef is not None: # Important: detach the variable from the graph # so we don't change it with other losses # see https://github.com/rail-berkeley/softlearning/issues/60 @@ -233,7 +237,7 @@ def train(self, gradient_steps: int, batch_size: int = 64) -> None: # Optimize entropy coefficient, also called # entropy temperature or alpha in the paper - if ent_coef_loss is not None: + if ent_coef_loss is not None and self.ent_coef_optimizer is not None: self.ent_coef_optimizer.zero_grad() ent_coef_loss.backward() self.ent_coef_optimizer.step() @@ -255,7 +259,8 @@ def train(self, gradient_steps: int, batch_size: int = 64) -> None: # Compute critic loss critic_loss = 0.5 * sum(F.mse_loss(current_q, target_q_values) for current_q in current_q_values) - critic_losses.append(critic_loss.item()) + assert isinstance(critic_loss, th.Tensor) # for type checker + critic_losses.append(critic_loss.item()) # type: ignore[union-attr] # Optimize the critic self.critic.optimizer.zero_grad() diff --git a/stable_baselines3/td3/policies.py b/stable_baselines3/td3/policies.py index 6c4a1e9c3..12117df89 100644 --- a/stable_baselines3/td3/policies.py +++ b/stable_baselines3/td3/policies.py @@ -1,7 +1,7 @@ from typing import Any, Dict, List, Optional, Type, Union import torch as th -from gym import spaces +from gymnasium import spaces from torch import nn from stable_baselines3.common.policies import BasePolicy, ContinuousCritic @@ -35,7 +35,7 @@ class Actor(BasePolicy): def __init__( self, observation_space: spaces.Space, - action_space: spaces.Space, + action_space: spaces.Box, net_arch: List[int], features_extractor: nn.Module, features_dim: int, @@ -106,10 +106,15 @@ class TD3Policy(BasePolicy): between the actor and the critic (this saves computation time) """ + actor: Actor + actor_target: Actor + critic: ContinuousCritic + critic_target: ContinuousCritic + def __init__( self, observation_space: spaces.Space, - action_space: spaces.Space, + action_space: spaces.Box, lr_schedule: Schedule, net_arch: Optional[Union[List[int], Dict[str, List[int]]]] = None, activation_fn: Type[nn.Module] = nn.ReLU, @@ -160,8 +165,6 @@ def __init__( } ) - self.actor, self.actor_target = None, None - self.critic, self.critic_target = None, None self.share_features_extractor = share_features_extractor self._build(lr_schedule) @@ -174,7 +177,11 @@ def _build(self, lr_schedule: Schedule) -> None: # Initialize the target to have the same weights as the actor self.actor_target.load_state_dict(self.actor.state_dict()) - self.actor.optimizer = self.optimizer_class(self.actor.parameters(), lr=lr_schedule(1), **self.optimizer_kwargs) + self.actor.optimizer = self.optimizer_class( + self.actor.parameters(), + lr=lr_schedule(1), # type: ignore[call-arg] + **self.optimizer_kwargs, + ) if self.share_features_extractor: self.critic = self.make_critic(features_extractor=self.actor.features_extractor) @@ -190,7 +197,11 @@ def _build(self, lr_schedule: Schedule) -> None: self.critic_target = self.make_critic(features_extractor=None) self.critic_target.load_state_dict(self.critic.state_dict()) - self.critic.optimizer = self.optimizer_class(self.critic.parameters(), lr=lr_schedule(1), **self.optimizer_kwargs) + self.critic.optimizer = self.optimizer_class( + self.critic.parameters(), + lr=lr_schedule(1), # type: ignore[call-arg] + **self.optimizer_kwargs, + ) # Target networks should always be in eval mode self.actor_target.set_training_mode(False) @@ -272,7 +283,7 @@ class CnnPolicy(TD3Policy): def __init__( self, observation_space: spaces.Space, - action_space: spaces.Space, + action_space: spaces.Box, lr_schedule: Schedule, net_arch: Optional[Union[List[int], Dict[str, List[int]]]] = None, activation_fn: Type[nn.Module] = nn.ReLU, @@ -326,7 +337,7 @@ class MultiInputPolicy(TD3Policy): def __init__( self, observation_space: spaces.Dict, - action_space: spaces.Space, + action_space: spaces.Box, lr_schedule: Schedule, net_arch: Optional[Union[List[int], Dict[str, List[int]]]] = None, activation_fn: Type[nn.Module] = nn.ReLU, diff --git a/stable_baselines3/td3/td3.py b/stable_baselines3/td3/td3.py index 809d39e33..10cea8efa 100644 --- a/stable_baselines3/td3/td3.py +++ b/stable_baselines3/td3/td3.py @@ -2,16 +2,16 @@ import numpy as np import torch as th -from gym import spaces +from gymnasium import spaces from torch.nn import functional as F from stable_baselines3.common.buffers import ReplayBuffer from stable_baselines3.common.noise import ActionNoise from stable_baselines3.common.off_policy_algorithm import OffPolicyAlgorithm -from stable_baselines3.common.policies import BasePolicy +from stable_baselines3.common.policies import BasePolicy, ContinuousCritic from stable_baselines3.common.type_aliases import GymEnv, MaybeCallback, Schedule from stable_baselines3.common.utils import get_parameters_by_name, polyak_update -from stable_baselines3.td3.policies import CnnPolicy, MlpPolicy, MultiInputPolicy, TD3Policy +from stable_baselines3.td3.policies import Actor, CnnPolicy, MlpPolicy, MultiInputPolicy, TD3Policy SelfTD3 = TypeVar("SelfTD3", bound="TD3") @@ -70,6 +70,11 @@ class TD3(OffPolicyAlgorithm): "CnnPolicy": CnnPolicy, "MultiInputPolicy": MultiInputPolicy, } + policy: TD3Policy + actor: Actor + actor_target: Actor + critic: ContinuousCritic + critic_target: ContinuousCritic def __init__( self, @@ -120,7 +125,7 @@ def __init__( seed=seed, sde_support=False, optimize_memory_usage=optimize_memory_usage, - supported_action_spaces=(spaces.Box), + supported_action_spaces=(spaces.Box,), support_multi_env=True, ) @@ -157,7 +162,7 @@ def train(self, gradient_steps: int, batch_size: int = 100) -> None: for _ in range(gradient_steps): self._n_updates += 1 # Sample replay buffer - replay_data = self.replay_buffer.sample(batch_size, env=self._vec_normalize_env) + replay_data = self.replay_buffer.sample(batch_size, env=self._vec_normalize_env) # type: ignore[union-attr] with th.no_grad(): # Select action according to policy and add clipped noise @@ -175,6 +180,7 @@ def train(self, gradient_steps: int, batch_size: int = 100) -> None: # Compute critic loss critic_loss = sum(F.mse_loss(current_q, target_q_values) for current_q in current_q_values) + assert isinstance(critic_loss, th.Tensor) critic_losses.append(critic_loss.item()) # Optimize the critics diff --git a/stable_baselines3/version.txt b/stable_baselines3/version.txt index 9eba1a13a..997bba239 100644 --- a/stable_baselines3/version.txt +++ b/stable_baselines3/version.txt @@ -1 +1 @@ -1.8.1a0 +2.0.0a4 diff --git a/tests/test_buffers.py b/tests/test_buffers.py index 9dc294c6a..825002c92 100644 --- a/tests/test_buffers.py +++ b/tests/test_buffers.py @@ -1,10 +1,11 @@ -import gym +import gymnasium as gym import numpy as np import pytest import torch as th -from gym import spaces +from gymnasium import spaces from stable_baselines3.common.buffers import DictReplayBuffer, DictRolloutBuffer, ReplayBuffer, RolloutBuffer +from stable_baselines3.common.env_checker import check_env from stable_baselines3.common.env_util import make_vec_env from stable_baselines3.common.type_aliases import DictReplayBufferSamples, ReplayBufferSamples from stable_baselines3.common.utils import get_device @@ -19,7 +20,7 @@ class DummyEnv(gym.Env): def __init__(self): self.action_space = spaces.Box(1, 5, (1,)) self.observation_space = spaces.Box(1, 5, (1,)) - self._observations = [1, 2, 3, 4, 5] + self._observations = np.array([[1.0], [2.0], [3.0], [4.0], [5.0]], dtype=np.float32) self._rewards = [1, 2, 3, 4, 5] self._t = 0 self._ep_length = 100 @@ -27,15 +28,16 @@ def __init__(self): def reset(self): self._t = 0 obs = self._observations[0] - return obs + return obs, {} def step(self, action): self._t += 1 index = self._t % len(self._observations) obs = self._observations[index] - done = self._t >= self._ep_length + terminated = False + truncated = self._t >= self._ep_length reward = self._rewards[index] - return obs, reward, done, {} + return obs, reward, terminated, truncated, {} class DummyDictEnv(gym.Env): @@ -48,7 +50,7 @@ def __init__(self): self.action_space = spaces.Box(1, 5, shape=(10, 7)) space = spaces.Box(1, 5, (1,)) self.observation_space = spaces.Dict({"observation": space, "achieved_goal": space, "desired_goal": space}) - self._observations = [1, 2, 3, 4, 5] + self._observations = np.array([[1.0], [2.0], [3.0], [4.0], [5.0]], dtype=np.float32) self._rewards = [1, 2, 3, 4, 5] self._t = 0 self._ep_length = 100 @@ -56,15 +58,23 @@ def __init__(self): def reset(self): self._t = 0 obs = {key: self._observations[0] for key in self.observation_space.spaces.keys()} - return obs + return obs, {} def step(self, action): self._t += 1 index = self._t % len(self._observations) obs = {key: self._observations[index] for key in self.observation_space.spaces.keys()} - done = self._t >= self._ep_length + terminated = False + truncated = self._t >= self._ep_length reward = self._rewards[index] - return obs, reward, done, {} + return obs, reward, terminated, truncated, {} + + +@pytest.mark.parametrize("env_cls", [DummyEnv, DummyDictEnv]) +def test_env(env_cls): + # Check the env used for testing + # Do not warn for assymetric space + check_env(env_cls(), warn=False, skip_render_check=True) @pytest.mark.parametrize("replay_buffer_cls", [ReplayBuffer, DictReplayBuffer]) diff --git a/tests/test_callbacks.py b/tests/test_callbacks.py index a9bdc431c..f8b0e5486 100644 --- a/tests/test_callbacks.py +++ b/tests/test_callbacks.py @@ -1,7 +1,7 @@ import os import shutil -import gym +import gymnasium as gym import numpy as np import pytest diff --git a/tests/test_cnn.py b/tests/test_cnn.py index 55c55ca9c..e32438c27 100644 --- a/tests/test_cnn.py +++ b/tests/test_cnn.py @@ -4,7 +4,7 @@ import numpy as np import pytest import torch as th -from gym import spaces +from gymnasium import spaces from stable_baselines3 import A2C, DQN, PPO, SAC, TD3 from stable_baselines3.common.envs import FakeImageEnv @@ -45,7 +45,7 @@ def test_cnn(tmp_path, model_class, share_features_extractor): # FakeImageEnv is channel last by default and should be wrapped assert is_vecenv_wrapped(model.get_env(), VecTransposeImage) - obs = env.reset() + obs, _ = env.reset() # Test stochastic predict with channel last input if model_class == DQN: @@ -248,7 +248,7 @@ def test_channel_first_env(tmp_path): assert not is_vecenv_wrapped(model.get_env(), VecTransposeImage) - obs = env.reset() + obs, _ = env.reset() action, _ = model.predict(obs, deterministic=True) diff --git a/tests/test_dict_env.py b/tests/test_dict_env.py index 2c114f613..14777452e 100644 --- a/tests/test_dict_env.py +++ b/tests/test_dict_env.py @@ -1,9 +1,12 @@ -import gym +from typing import Dict, Optional + +import gymnasium as gym import numpy as np import pytest -from gym import spaces +from gymnasium import spaces from stable_baselines3 import A2C, DDPG, DQN, PPO, SAC, TD3 +from stable_baselines3.common.env_checker import check_env from stable_baselines3.common.env_util import make_vec_env from stable_baselines3.common.envs import BitFlippingEnv, SimpleMultiObsEnv from stable_baselines3.common.evaluation import evaluate_policy @@ -13,7 +16,7 @@ class DummyDictEnv(gym.Env): """Custom Environment for testing purposes only""" - metadata = {"render.modes": ["human"]} + metadata = {"render_modes": ["human"]} def __init__( self, @@ -66,19 +69,31 @@ def seed(self, seed=None): def step(self, action): reward = 0.0 - done = False - return self.observation_space.sample(), reward, done, {} - - def compute_reward(self, achieved_goal, desired_goal, info): - return np.zeros((len(achieved_goal),)) + terminated = truncated = False + return self.observation_space.sample(), reward, terminated, truncated, {} - def reset(self): - return self.observation_space.sample() + def reset(self, *, seed: Optional[int] = None, options: Optional[Dict] = None): + if seed is not None: + self.observation_space.seed(seed) + return self.observation_space.sample(), {} - def render(self, mode="human"): + def render(self): pass +@pytest.mark.parametrize("use_discrete_actions", [True, False]) +@pytest.mark.parametrize("channel_last", [True, False]) +@pytest.mark.parametrize("nested_dict_obs", [True, False]) +@pytest.mark.parametrize("vec_only", [True, False]) +def test_env(use_discrete_actions, channel_last, nested_dict_obs, vec_only): + # Check the env used for testing + if nested_dict_obs: + with pytest.warns(UserWarning, match="Nested observation spaces are not supported"): + check_env(DummyDictEnv(use_discrete_actions, channel_last, nested_dict_obs, vec_only)) + else: + check_env(DummyDictEnv(use_discrete_actions, channel_last, nested_dict_obs, vec_only)) + + @pytest.mark.parametrize("policy", ["MlpPolicy", "CnnPolicy"]) def test_policy_hint(policy): # Common mistake: using the wrong policy @@ -105,7 +120,7 @@ def test_consistency(model_class): dict_env = gym.wrappers.TimeLimit(dict_env, 100) env = gym.wrappers.FlattenObservation(dict_env) dict_env.seed(10) - obs = dict_env.reset() + obs, _ = dict_env.reset() kwargs = {} n_steps = 256 diff --git a/tests/test_distributions.py b/tests/test_distributions.py index e782182f4..48eae12d0 100644 --- a/tests/test_distributions.py +++ b/tests/test_distributions.py @@ -1,7 +1,7 @@ from copy import deepcopy from typing import Tuple -import gym +import gymnasium as gym import numpy as np import pytest import torch as th diff --git a/tests/test_env_checker.py b/tests/test_env_checker.py index 94aeb3c97..1050e866e 100644 --- a/tests/test_env_checker.py +++ b/tests/test_env_checker.py @@ -1,26 +1,32 @@ -import gym +from typing import Dict, Optional + +import gymnasium as gym import numpy as np import pytest -from gym import spaces +from gymnasium import spaces from stable_baselines3.common.env_checker import check_env class ActionDictTestEnv(gym.Env): + metadata = {"render_modes": ["human"]} + render_mode = None + action_space = spaces.Dict({"position": spaces.Discrete(1), "velocity": spaces.Discrete(1)}) observation_space = spaces.Box(low=-1.0, high=2.0, shape=(3,), dtype=np.float32) def step(self, action): observation = np.array([1.0, 1.5, 0.5], dtype=self.observation_space.dtype) reward = 1 - done = True + terminated = True + truncated = False info = {} - return observation, reward, done, info + return observation, reward, terminated, truncated, info def reset(self): - return np.array([1.0, 1.5, 0.5], dtype=self.observation_space.dtype) + return np.array([1.0, 1.5, 0.5], dtype=self.observation_space.dtype), {} - def render(self, mode="human"): + def render(self): pass @@ -94,12 +100,12 @@ def test_check_env_detailed_error(obs_tuple, method): class TestEnv(gym.Env): action_space = spaces.Box(low=-1.0, high=1.0, shape=(3,), dtype=np.float32) - def reset(self): - return wrong_obs if method == "reset" else good_obs + def reset(self, *, seed: Optional[int] = None, options: Optional[Dict] = None): + return wrong_obs if method == "reset" else good_obs, {} def step(self, action): obs = wrong_obs if method == "step" else good_obs - return obs, 0.0, True, {} + return obs, 0.0, True, False, {} TestEnv.observation_space = observation_space diff --git a/tests/test_envs.py b/tests/test_envs.py index 1281bb45f..aeb248fbb 100644 --- a/tests/test_envs.py +++ b/tests/test_envs.py @@ -1,10 +1,10 @@ import types import warnings -import gym +import gymnasium as gym import numpy as np import pytest -from gym import spaces +from gymnasium import spaces from stable_baselines3.common.env_checker import check_env from stable_baselines3.common.envs import ( @@ -75,6 +75,17 @@ def test_bit_flipping(kwargs): # No warnings for custom envs assert len(record) == 0 + # Remove a key, must throw an error + obs_space = env.observation_space.spaces["observation"] + del env.observation_space.spaces["observation"] + with pytest.raises(AssertionError): + check_env(env) + + # Rename a key, must throw an error + env.observation_space.spaces["obs"] = obs_space + with pytest.raises(AssertionError): + check_env(env) + def test_high_dimension_action_space(): """ @@ -87,7 +98,7 @@ def test_high_dimension_action_space(): # Patch to avoid error def patched_step(_action): - return env.observation_space.sample(), 0.0, False, {} + return env.observation_space.sample(), 0.0, False, False, {} env.step = patched_step check_env(env) @@ -110,16 +121,20 @@ def patched_step(_action): spaces.Dict({"position": spaces.Dict({"abs": spaces.Discrete(5), "rel": spaces.Discrete(2)})}), # Small image inside a dict spaces.Dict({"img": spaces.Box(low=0, high=255, shape=(32, 32, 3), dtype=np.uint8)}), + # Non zero start index + spaces.Discrete(3, start=-1), + # Non zero start index inside a Dict + spaces.Dict({"obs": spaces.Discrete(3, start=1)}), ], ) def test_non_default_spaces(new_obs_space): env = FakeImageEnv() env.observation_space = new_obs_space # Patch methods to avoid errors - env.reset = new_obs_space.sample + env.reset = lambda: (new_obs_space.sample(), {}) def patched_step(_action): - return new_obs_space.sample(), 0.0, False, {} + return new_obs_space.sample(), 0.0, False, False, {} env.step = patched_step with pytest.warns(UserWarning): @@ -145,6 +160,8 @@ def patched_step(_action): spaces.Box(low=-np.inf, high=1, shape=(2,), dtype=np.float32), # Almost good, except for one dim spaces.Box(low=np.array([-1, -1, -1]), high=np.array([1, 1, 0.99]), dtype=np.float32), + # Non zero start index + spaces.Discrete(3, start=-1), ], ) def test_non_default_action_spaces(new_action_space): @@ -155,14 +172,26 @@ def test_non_default_action_spaces(new_action_space): # No warnings for custom envs assert len(record) == 0 + # Change the action space env.action_space = new_action_space + # Discrete action space + if isinstance(new_action_space, spaces.Discrete): + with pytest.warns(UserWarning): + check_env(env) + return + + low, high = new_action_space.low[0], new_action_space.high[0] # Unbounded action space throws an error, # the rest only warning if not np.all(np.isfinite(env.action_space.low)): with pytest.raises(AssertionError), pytest.warns(UserWarning): check_env(env) + # numpy >= 1.21 raises a ValueError + elif int(np.__version__.split(".")[1]) >= 21 and (low > high): + with pytest.raises(ValueError), pytest.warns(UserWarning): + check_env(env) else: with pytest.warns(UserWarning): check_env(env) @@ -176,7 +205,7 @@ def check_reset_assert_error(env, new_reset_return): """ def wrong_reset(): - return new_reset_return + return new_reset_return, {} # Patch the reset method with a wrong one env.reset = wrong_reset @@ -194,6 +223,11 @@ def test_common_failures_reset(): # The observation is not a numpy array check_reset_assert_error(env, 1) + # Return only obs (gym < 0.26) + env.reset = env.observation_space.sample + with pytest.raises(AssertionError): + check_env(env) + # Return not only the observation check_reset_assert_error(env, (env.observation_space.sample(), False)) @@ -206,10 +240,10 @@ def test_common_failures_reset(): wrong_obs = {**env.observation_space.sample(), "extra_key": None} check_reset_assert_error(env, wrong_obs) - obs = env.reset() + obs, _ = env.reset() def wrong_reset(self): - return {"img": obs["img"], "vec": obs["img"]} + return {"img": obs["img"], "vec": obs["img"]}, {} env.reset = types.MethodType(wrong_reset, env) with pytest.raises(AssertionError) as excinfo: @@ -242,33 +276,38 @@ def test_common_failures_step(): env = IdentityEnvBox() # Wrong shape for the observation - check_step_assert_error(env, (np.ones((4,)), 1.0, False, {})) + check_step_assert_error(env, (np.ones((4,)), 1.0, False, False, {})) # Obs is not a numpy array - check_step_assert_error(env, (1, 1.0, False, {})) + check_step_assert_error(env, (1, 1.0, False, False, {})) # Return a wrong reward - check_step_assert_error(env, (env.observation_space.sample(), np.ones(1), False, {})) + check_step_assert_error(env, (env.observation_space.sample(), np.ones(1), False, False, {})) # Info dict is not returned - check_step_assert_error(env, (env.observation_space.sample(), 0.0, False)) + check_step_assert_error(env, (env.observation_space.sample(), 0.0, False, False)) + + # Truncated is not returned (gym < 0.26) + check_step_assert_error(env, (env.observation_space.sample(), 0.0, False, {})) # Done is not a boolean - check_step_assert_error(env, (env.observation_space.sample(), 0.0, 3.0, {})) - check_step_assert_error(env, (env.observation_space.sample(), 0.0, 1, {})) + check_step_assert_error(env, (env.observation_space.sample(), 0.0, 3.0, False, {})) + check_step_assert_error(env, (env.observation_space.sample(), 0.0, 1, False, {})) + # Truncated is not a boolean + check_step_assert_error(env, (env.observation_space.sample(), 0.0, False, 1.0, {})) env = SimpleMultiObsEnv() # Observation keys and observation space keys must match wrong_obs = env.observation_space.sample() wrong_obs.pop("img") - check_step_assert_error(env, (wrong_obs, 0.0, False, {})) + check_step_assert_error(env, (wrong_obs, 0.0, False, False, {})) wrong_obs = {**env.observation_space.sample(), "extra_key": None} - check_step_assert_error(env, (wrong_obs, 0.0, False, {})) + check_step_assert_error(env, (wrong_obs, 0.0, False, False, {})) - obs = env.reset() + obs, _ = env.reset() def wrong_step(self, action): - return {"img": obs["vec"], "vec": obs["vec"]}, 0.0, False, {} + return {"img": obs["vec"], "vec": obs["vec"]}, 0.0, False, False, {} env.step = types.MethodType(wrong_step, env) with pytest.raises(AssertionError) as excinfo: diff --git a/tests/test_gae.py b/tests/test_gae.py index c90470f00..83b95a4c0 100644 --- a/tests/test_gae.py +++ b/tests/test_gae.py @@ -1,11 +1,14 @@ -import gym +from typing import Dict, Optional + +import gymnasium as gym import numpy as np import pytest import torch as th -from gym import spaces +from gymnasium import spaces from stable_baselines3 import A2C, PPO, SAC from stable_baselines3.common.callbacks import BaseCallback +from stable_baselines3.common.env_checker import check_env from stable_baselines3.common.policies import ActorCriticPolicy @@ -20,20 +23,26 @@ def __init__(self, max_steps=8): def seed(self, seed): self.observation_space.seed(seed) - def reset(self): + def reset(self, *, seed: Optional[int] = None, options: Optional[Dict] = None): + if seed is not None: + self.observation_space.seed(seed) self.n_steps = 0 - return self.observation_space.sample() + return self.observation_space.sample(), {} def step(self, action): self.n_steps += 1 - done = False + terminated = truncated = False reward = 0.0 if self.n_steps >= self.max_steps: reward = 1.0 - done = True + terminated = True + # To simplify GAE computation checks, + # we do not consider truncation here. + # Truncations are checked in InfiniteHorizonEnv + truncated = False - return self.observation_space.sample(), reward, done, {} + return self.observation_space.sample(), reward, terminated, truncated, {} class InfiniteHorizonEnv(gym.Env): @@ -44,13 +53,16 @@ def __init__(self, n_states=4): self.action_space = spaces.Box(low=-1, high=1, shape=(2,), dtype=np.float32) self.current_state = 0 - def reset(self): + def reset(self, *, seed: Optional[int] = None, options: Optional[Dict] = None): + if seed is not None: + super().reset(seed=seed) + self.current_state = 0 - return self.current_state + return self.current_state, {} def step(self, action): self.current_state = (self.current_state + 1) % self.n_states - return self.current_state, 1.0, False, {} + return self.current_state, 1.0, False, False, {} class CheckGAECallback(BaseCallback): @@ -110,6 +122,12 @@ def forward(self, obs, deterministic=False): return actions, values, log_prob +@pytest.mark.parametrize("env_cls", [CustomEnv, InfiniteHorizonEnv]) +def test_env(env_cls): + # Check the env used for testing + check_env(env_cls(), skip_render_check=True) + + @pytest.mark.parametrize("model_class", [A2C, PPO]) @pytest.mark.parametrize("gae_lambda", [1.0, 0.9]) @pytest.mark.parametrize("gamma", [1.0, 0.99]) diff --git a/tests/test_her.py b/tests/test_her.py index 57a8f0ff0..e17336bc5 100644 --- a/tests/test_her.py +++ b/tests/test_her.py @@ -251,7 +251,7 @@ def env_fn(): train_freq=4, buffer_size=int(2e4), policy_kwargs=dict(net_arch=[64]), - seed=1, + seed=0, ) model.learn(200) old_replay_buffer = deepcopy(model.replay_buffer) diff --git a/tests/test_identity.py b/tests/test_identity.py index cc7746bc7..3118a4d7a 100644 --- a/tests/test_identity.py +++ b/tests/test_identity.py @@ -15,21 +15,17 @@ def test_discrete(model_class, env): env_ = DummyVecEnv([lambda: env]) kwargs = {} - n_steps = 3000 + n_steps = 2500 if model_class == DQN: kwargs = dict(learning_starts=0) - n_steps = 4000 # DQN only support discrete actions if isinstance(env, (IdentityEnvMultiDiscrete, IdentityEnvMultiBinary)): return - elif model_class == A2C: - # slightly higher budget - n_steps = 3500 - model = model_class("MlpPolicy", env_, gamma=0.4, seed=1, **kwargs).learn(n_steps) + model = model_class("MlpPolicy", env_, gamma=0.4, seed=3, **kwargs).learn(n_steps) evaluate_policy(model, env_, n_eval_episodes=20, reward_threshold=90, warn=False) - obs = env.reset() + obs, _ = env.reset() assert np.shape(model.predict(obs)[0]) == np.shape(obs) @@ -38,16 +34,19 @@ def test_discrete(model_class, env): def test_continuous(model_class): env = IdentityEnvBox(eps=0.5) - n_steps = {A2C: 3500, PPO: 3000, SAC: 700, TD3: 500, DDPG: 500}[model_class] + n_steps = {A2C: 2000, PPO: 2000, SAC: 400, TD3: 400, DDPG: 400}[model_class] kwargs = dict(policy_kwargs=dict(net_arch=[64, 64]), seed=0, gamma=0.95) + if model_class in [TD3]: n_actions = 1 action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions)) kwargs["action_noise"] = action_noise elif model_class in [A2C]: kwargs["policy_kwargs"]["log_std_init"] = -0.5 + elif model_class == PPO: + kwargs = dict(n_steps=512, n_epochs=5) - model = model_class("MlpPolicy", env, **kwargs).learn(n_steps) + model = model_class("MlpPolicy", env, learning_rate=1e-3, **kwargs).learn(n_steps) evaluate_policy(model, env, n_eval_episodes=20, reward_threshold=90, warn=False) diff --git a/tests/test_logger.py b/tests/test_logger.py index 54fc864c8..c33bd4c03 100644 --- a/tests/test_logger.py +++ b/tests/test_logger.py @@ -5,15 +5,16 @@ from typing import Sequence from unittest import mock -import gym +import gymnasium as gym import numpy as np import pytest import torch as th -from gym import spaces +from gymnasium import spaces from matplotlib import pyplot as plt from pandas.errors import EmptyDataError from stable_baselines3 import A2C, DQN +from stable_baselines3.common.env_checker import check_env from stable_baselines3.common.logger import ( DEBUG, INFO, @@ -352,12 +353,18 @@ def __init__(self, delay: float = 0.01): self.action_space = spaces.Discrete(2) def reset(self): - return self.observation_space.sample() + return self.observation_space.sample(), {} def step(self, action): time.sleep(self.delay) obs = self.observation_space.sample() - return obs, 0.0, True, {} + return obs, 0.0, True, False, {} + + +@pytest.mark.parametrize("env_cls", [TimeDelayEnv]) +def test_env(env_cls): + # Check the env used for testing + check_env(env_cls(), skip_render_check=True) class InMemoryLogger(Logger): diff --git a/tests/test_monitor.py b/tests/test_monitor.py index 17002f39a..2428d2216 100644 --- a/tests/test_monitor.py +++ b/tests/test_monitor.py @@ -2,7 +2,7 @@ import os import uuid -import gym +import gymnasium as gym import pandas from stable_baselines3.common.monitor import Monitor, get_monitor_files, load_results @@ -13,7 +13,7 @@ def test_monitor(tmp_path): Test the monitor wrapper """ env = gym.make("CartPole-v1") - env.seed(0) + env.reset(seed=0) monitor_file = os.path.join(str(tmp_path), f"stable_baselines-test-{uuid.uuid4()}.monitor.csv") monitor_env = Monitor(env, monitor_file) monitor_env.reset() @@ -22,10 +22,10 @@ def test_monitor(tmp_path): ep_lengths = [] ep_len, ep_reward = 0, 0 for _ in range(total_steps): - _, reward, done, _ = monitor_env.step(monitor_env.action_space.sample()) + _, reward, terminated, truncated, _ = monitor_env.step(monitor_env.action_space.sample()) ep_len += 1 ep_reward += reward - if done: + if terminated or truncated: ep_rewards.append(ep_reward) ep_lengths.append(ep_len) monitor_env.reset() @@ -64,7 +64,7 @@ def test_monitor_load_results(tmp_path): """ tmp_path = str(tmp_path) env1 = gym.make("CartPole-v1") - env1.seed(0) + env1.reset(seed=0) monitor_file1 = os.path.join(tmp_path, f"stable_baselines-test-{uuid.uuid4()}.monitor.csv") monitor_env1 = Monitor(env1, monitor_file1) @@ -75,8 +75,8 @@ def test_monitor_load_results(tmp_path): monitor_env1.reset() episode_count1 = 0 for _ in range(1000): - _, _, done, _ = monitor_env1.step(monitor_env1.action_space.sample()) - if done: + _, _, terminated, truncated, _ = monitor_env1.step(monitor_env1.action_space.sample()) + if terminated or truncated: episode_count1 += 1 monitor_env1.reset() @@ -84,7 +84,7 @@ def test_monitor_load_results(tmp_path): assert results_size1 == episode_count1 env2 = gym.make("CartPole-v1") - env2.seed(0) + env2.reset(seed=0) monitor_file2 = os.path.join(tmp_path, f"stable_baselines-test-{uuid.uuid4()}.monitor.csv") monitor_env2 = Monitor(env2, monitor_file2) monitor_files = get_monitor_files(tmp_path) @@ -98,8 +98,8 @@ def test_monitor_load_results(tmp_path): monitor_env2 = Monitor(env2, monitor_file2, override_existing=False) monitor_env2.reset() for _ in range(1000): - _, _, done, _ = monitor_env2.step(monitor_env2.action_space.sample()) - if done: + _, _, terminated, truncated, _ = monitor_env2.step(monitor_env2.action_space.sample()) + if terminated or truncated: episode_count2 += 1 monitor_env2.reset() diff --git a/tests/test_predict.py b/tests/test_predict.py index 579abff77..247fe9172 100644 --- a/tests/test_predict.py +++ b/tests/test_predict.py @@ -1,10 +1,11 @@ -import gym +import gymnasium as gym import numpy as np import pytest import torch as th -from gym import spaces +from gymnasium import spaces from stable_baselines3 import A2C, DQN, PPO, SAC, TD3 +from stable_baselines3.common.env_checker import check_env from stable_baselines3.common.envs import IdentityEnv from stable_baselines3.common.utils import get_device from stable_baselines3.common.vec_env import DummyVecEnv @@ -30,10 +31,16 @@ def __init__(self): self.action_space = SubClassedBox(-1, 1, shape=(2,), dtype=np.float32) def reset(self): - return self.observation_space.sample() + return self.observation_space.sample(), {} def step(self, action): - return self.observation_space.sample(), 0.0, np.random.rand() > 0.5, {} + return self.observation_space.sample(), 0.0, np.random.rand() > 0.5, False, {} + + +@pytest.mark.parametrize("env_cls", [CustomSubClassedSpaceEnv]) +def test_env(env_cls): + # Check the env used for testing + check_env(env_cls(), skip_render_check=True) @pytest.mark.parametrize("model_class", MODEL_LIST) @@ -70,7 +77,7 @@ def test_predict(model_class, env_id, device): env = gym.make(env_id) vec_env = DummyVecEnv([lambda: gym.make(env_id), lambda: gym.make(env_id)]) - obs = env.reset() + obs, _ = env.reset() action, _ = model.predict(obs) assert isinstance(action, np.ndarray) assert action.shape == env.action_space.shape @@ -96,7 +103,7 @@ def test_dqn_epsilon_greedy(): env = IdentityEnv(2) model = DQN("MlpPolicy", env) model.exploration_rate = 1.0 - obs = env.reset() + obs, _ = env.reset() # is vectorized should not crash with discrete obs action, _ = model.predict(obs, deterministic=False) assert env.action_space.contains(action) @@ -107,5 +114,5 @@ def test_subclassed_space_env(model_class): env = CustomSubClassedSpaceEnv() model = model_class("MlpPolicy", env, policy_kwargs=dict(net_arch=[32])) model.learn(300) - obs = env.reset() + obs, _ = env.reset() env.step(model.predict(obs)) diff --git a/tests/test_preprocessing.py b/tests/test_preprocessing.py index 89f869b45..b8a5891c7 100644 --- a/tests/test_preprocessing.py +++ b/tests/test_preprocessing.py @@ -1,5 +1,5 @@ import torch -from gym import spaces +from gymnasium import spaces from stable_baselines3.common.preprocessing import get_obs_shape, preprocess_obs diff --git a/tests/test_run.py b/tests/test_run.py index ca7548ff9..31c7b956e 100644 --- a/tests/test_run.py +++ b/tests/test_run.py @@ -1,4 +1,4 @@ -import gym +import gymnasium as gym import numpy as np import pytest diff --git a/tests/test_save_load.py b/tests/test_save_load.py index 9d3d537b7..2f227adf6 100644 --- a/tests/test_save_load.py +++ b/tests/test_save_load.py @@ -8,7 +8,7 @@ from collections import OrderedDict from copy import deepcopy -import gym +import gymnasium as gym import numpy as np import pytest import torch as th diff --git a/tests/test_spaces.py b/tests/test_spaces.py index 6dd6dc419..6d18fcef8 100644 --- a/tests/test_spaces.py +++ b/tests/test_spaces.py @@ -1,9 +1,12 @@ -import gym +from typing import Dict, Optional + +import gymnasium as gym import numpy as np import pytest -from gym import spaces +from gymnasium import spaces from stable_baselines3 import A2C, DDPG, DQN, PPO, SAC, TD3 +from stable_baselines3.common.env_checker import check_env from stable_baselines3.common.env_util import make_vec_env from stable_baselines3.common.evaluation import evaluate_policy @@ -14,11 +17,13 @@ def __init__(self, nvec): self.observation_space = spaces.MultiDiscrete(nvec) self.action_space = spaces.Box(low=-1, high=1, shape=(2,), dtype=np.float32) - def reset(self): - return self.observation_space.sample() + def reset(self, *, seed: Optional[int] = None, options: Optional[Dict] = None): + if seed is not None: + super().reset(seed=seed) + return self.observation_space.sample(), {} def step(self, action): - return self.observation_space.sample(), 0.0, False, {} + return self.observation_space.sample(), 0.0, False, False, {} class DummyMultiBinary(gym.Env): @@ -27,11 +32,13 @@ def __init__(self, n): self.observation_space = spaces.MultiBinary(n) self.action_space = spaces.Box(low=-1, high=1, shape=(2,), dtype=np.float32) - def reset(self): - return self.observation_space.sample() + def reset(self, *, seed: Optional[int] = None, options: Optional[Dict] = None): + if seed is not None: + super().reset(seed=seed) + return self.observation_space.sample(), {} def step(self, action): - return self.observation_space.sample(), 0.0, False, {} + return self.observation_space.sample(), 0.0, False, False, {} class DummyMultidimensionalAction(gym.Env): @@ -41,10 +48,16 @@ def __init__(self): self.action_space = spaces.Box(low=-1, high=1, shape=(2, 2), dtype=np.float32) def reset(self): - return self.observation_space.sample() + return self.observation_space.sample(), {} def step(self, action): - return self.observation_space.sample(), 0.0, False, {} + return self.observation_space.sample(), 0.0, False, False, {} + + +@pytest.mark.parametrize("env", [DummyMultiDiscreteSpace([4, 3]), DummyMultiBinary(8), DummyMultiBinary((3, 2))]) +def test_env(env): + # Check the env used for testing + check_env(env, skip_render_check=True) @pytest.mark.parametrize("model_class", [SAC, TD3, DQN]) diff --git a/tests/test_train_eval_mode.py b/tests/test_train_eval_mode.py index f3a012fd4..dcbda74e1 100644 --- a/tests/test_train_eval_mode.py +++ b/tests/test_train_eval_mode.py @@ -1,6 +1,6 @@ from typing import Union -import gym +import gymnasium as gym import numpy as np import pytest import torch as th diff --git a/tests/test_utils.py b/tests/test_utils.py index 88de942e5..02128edb7 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,11 +1,11 @@ import os import shutil -import gym +import gymnasium as gym import numpy as np import pytest import torch as th -from gym import spaces +from gymnasium import spaces import stable_baselines3 as sb3 from stable_baselines3 import A2C @@ -28,7 +28,7 @@ @pytest.mark.parametrize("env_id", ["CartPole-v1", lambda: gym.make("CartPole-v1")]) @pytest.mark.parametrize("n_envs", [1, 2]) @pytest.mark.parametrize("vec_env_cls", [None, SubprocVecEnv]) -@pytest.mark.parametrize("wrapper_class", [None, gym.wrappers.TimeLimit]) +@pytest.mark.parametrize("wrapper_class", [None, gym.wrappers.RecordEpisodeStatistics]) def test_make_vec_env(env_id, n_envs, vec_env_cls, wrapper_class): env = make_vec_env(env_id, n_envs, vec_env_cls=vec_env_cls, wrapper_class=wrapper_class, monitor_dir=None, seed=0) @@ -194,7 +194,7 @@ def dummy_callback(locals_, _globals): policy.n_callback_calls = 0 # type: ignore[assignment, attr-defined] _, episode_lengths = evaluate_policy( policy, # type: ignore[arg-type] - model.get_env(), + model.get_env(), # type: ignore[arg-type] n_eval_episodes, deterministic=True, render=False, @@ -213,7 +213,7 @@ def dummy_callback(locals_, _globals): episode_rewards, _ = evaluate_policy( policy, # type: ignore[arg-type] - model.get_env(), + model.get_env(), # type: ignore[arg-type] n_eval_episodes, return_episode_rewards=True, ) @@ -239,17 +239,18 @@ def __init__(self, env): self.needs_reset = True def step(self, action): - obs, reward, done, info = self.env.step(action) - self.needs_reset = done + obs, reward, terminated, truncated, info = self.env.step(action) + self.needs_reset = terminated or truncated self.last_obs = obs - return obs, reward, True, info + return obs, reward, True, truncated, info def reset(self, **kwargs): + info = {} if self.needs_reset: - obs = self.env.reset(**kwargs) + obs, info = self.env.reset(**kwargs) self.last_obs = obs self.needs_reset = False - return self.last_obs + return self.last_obs, info @pytest.mark.parametrize("n_envs", [1, 2, 5, 7]) diff --git a/tests/test_vec_check_nan.py b/tests/test_vec_check_nan.py index 962355782..1253be6e5 100644 --- a/tests/test_vec_check_nan.py +++ b/tests/test_vec_check_nan.py @@ -1,7 +1,7 @@ -import gym +import gymnasium as gym import numpy as np import pytest -from gym import spaces +from gymnasium import spaces from stable_baselines3.common.vec_env import DummyVecEnv, VecCheckNan @@ -24,13 +24,13 @@ def step(action): obs = float("inf") else: obs = 0 - return [obs], 0.0, False, {} + return [obs], 0.0, False, False, {} @staticmethod def reset(): - return [0.0] + return [0.0], {} - def render(self, mode="human", close=False): + def render(self): pass diff --git a/tests/test_vec_envs.py b/tests/test_vec_envs.py index ae05947c5..6bc7e74db 100644 --- a/tests/test_vec_envs.py +++ b/tests/test_vec_envs.py @@ -2,12 +2,16 @@ import functools import itertools import multiprocessing +import os +import warnings +from typing import Dict, Optional -import gym +import gymnasium as gym import numpy as np import pytest -from gym import spaces +from gymnasium import spaces +from stable_baselines3.common.env_util import make_vec_env from stable_baselines3.common.monitor import Monitor from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv, VecFrameStack, VecNormalize @@ -17,7 +21,7 @@ class CustomGymEnv(gym.Env): - def __init__(self, space): + def __init__(self, space, render_mode: str = "rgb_array"): """ Custom gym environment for testing purposes """ @@ -25,24 +29,28 @@ def __init__(self, space): self.observation_space = space self.current_step = 0 self.ep_length = 4 + self.render_mode = render_mode - def reset(self): + def reset(self, *, seed: Optional[int] = None, options: Optional[Dict] = None): + if seed is not None: + self.seed(seed) self.current_step = 0 self._choose_next_state() - return self.state + return self.state, {} def step(self, action): reward = float(np.random.rand()) self._choose_next_state() self.current_step += 1 - done = self.current_step >= self.ep_length - return self.state, reward, done, {} + terminated = False + truncated = self.current_step >= self.ep_length + return self.state, reward, terminated, truncated, {} def _choose_next_state(self): self.state = self.observation_space.sample() - def render(self, mode="human"): - if mode == "rgb_array": + def render(self): + if self.render_mode == "rgb_array": return np.zeros((4, 4, 3)) def seed(self, seed=None): @@ -91,9 +99,20 @@ def make_env(): # Test seed method vec_env.seed(0) + # Test render method call - # vec_env.render() # we need a X server to test the "human" mode - vec_env.render(mode="rgb_array") + array_explicit_mode = vec_env.render(mode="rgb_array") + # test render without argument (new gym API style) + array_implicit_mode = vec_env.render() + assert np.array_equal(array_implicit_mode, array_explicit_mode) + + # test warning if you try different render mode + with pytest.warns(UserWarning): + vec_env.render(mode="something_else") + + # we need a X server to test the "human" mode (uses OpenCV) + # vec_env.render(mode="human") + env_method_results = vec_env.env_method("custom_method", 1, indices=None, dim_1=2) setattr_results = [] # Set current_step to an arbitrary value @@ -155,13 +174,14 @@ def __init__(self, max_steps): def reset(self): self.current_step = 0 - return np.array([self.current_step], dtype="int") + return np.array([self.current_step], dtype="int"), {} def step(self, action): prev_step = self.current_step self.current_step += 1 - done = self.current_step >= self.max_steps - return np.array([prev_step], dtype="int"), 0.0, done, {} + terminated = False + truncated = self.current_step >= self.max_steps + return np.array([prev_step], dtype="int"), 0.0, terminated, truncated, {} @pytest.mark.parametrize("vec_env_class", VEC_ENV_CLASSES) @@ -455,6 +475,23 @@ def make_monitored_env(): assert vec_env.env_is_wrapped(Monitor) == [False, True] +@pytest.mark.parametrize("vec_env_class", VEC_ENV_CLASSES) +def test_backward_compat_seed(vec_env_class): + def make_env(): + env = CustomGymEnv(gym.spaces.Box(low=np.zeros(2), high=np.ones(2))) + # Patch reset function to remove seed param + env.reset = lambda: (env.observation_space.sample(), {}) + env.seed = env.observation_space.seed + return env + + vec_env = vec_env_class([make_env for _ in range(N_ENVS)]) + vec_env.seed(3) + obs = vec_env.reset() + vec_env.seed(3) + new_obs = vec_env.reset() + assert np.allclose(new_obs, obs) + + @pytest.mark.parametrize("vec_env_class", VEC_ENV_CLASSES) def test_vec_seeding(vec_env_class): def make_env(): @@ -484,3 +521,63 @@ def make_env(): assert not np.allclose(rewards[1], rewards[2]) vec_env.close() + + +@pytest.mark.parametrize("vec_env_class", VEC_ENV_CLASSES) +def test_render(vec_env_class): + # Skip if no X-Server + if not os.environ.get("DISPLAY"): + pytest.skip("No X-Server") + + env_id = "Pendulum-v1" + # DummyVecEnv human render is currently + # buggy because of gym: + # https://github.com/carlosluis/stable-baselines3/pull/3#issuecomment-1356863808 + n_envs = 2 + # Human render + vec_env = make_vec_env( + env_id, + n_envs, + vec_env_cls=vec_env_class, + env_kwargs=dict(render_mode="human"), + ) + + vec_env.reset() + vec_env.render() + + with pytest.warns(UserWarning): + vec_env.render("rgb_array") + + with pytest.warns(UserWarning): + vec_env.render(mode="blah") + + for _ in range(10): + vec_env.step([vec_env.action_space.sample() for _ in range(n_envs)]) + vec_env.render() + + vec_env.close() + # rgb_array render, which allows human_render + # thanks to OpenCV + vec_env = make_vec_env( + env_id, + n_envs, + vec_env_cls=vec_env_class, + env_kwargs=dict(render_mode="rgb_array"), + ) + + vec_env.reset() + with warnings.catch_warnings(record=True) as record: + vec_env.render() + vec_env.render("rgb_array") + vec_env.render(mode="human") + + # No warnings for using human mode + assert len(record) == 0 + + with pytest.warns(UserWarning): + vec_env.render(mode="blah") + + for _ in range(10): + vec_env.step([vec_env.action_space.sample() for _ in range(n_envs)]) + vec_env.render() + vec_env.close() diff --git a/tests/test_vec_extract_dict_obs.py b/tests/test_vec_extract_dict_obs.py index 17728bbd3..8c8dccd47 100644 --- a/tests/test_vec_extract_dict_obs.py +++ b/tests/test_vec_extract_dict_obs.py @@ -1,5 +1,5 @@ import numpy as np -from gym import spaces +from gymnasium import spaces from stable_baselines3 import PPO from stable_baselines3.common.vec_env import VecExtractDictObs, VecMonitor @@ -41,7 +41,7 @@ def reset(self): self.n_steps = 0 return {"rgb": np.zeros((self.num_envs, 86, 86))} - def render(self, mode="human", close=False): + def render(self, close=False): pass diff --git a/tests/test_vec_monitor.py b/tests/test_vec_monitor.py index 0a146a057..1a0e94d90 100644 --- a/tests/test_vec_monitor.py +++ b/tests/test_vec_monitor.py @@ -2,8 +2,9 @@ import json import os import uuid +import warnings -import gym +import gymnasium as gym import pandas import pytest @@ -132,8 +133,9 @@ def test_vec_monitor_ppo(recwarn): """ Test the `VecMonitor` with PPO """ + warnings.filterwarnings(action="ignore", category=DeprecationWarning, module=r".*passive_env_checker") env = DummyVecEnv([lambda: gym.make("CartPole-v1")]) - env.seed(0) + env.seed(seed=0) monitor_env = VecMonitor(env) model = PPO("MlpPolicy", monitor_env, verbose=1, n_steps=64, device="cpu") model.learn(total_timesteps=250) diff --git a/tests/test_vec_normalize.py b/tests/test_vec_normalize.py index 27bba9aad..ae5904795 100644 --- a/tests/test_vec_normalize.py +++ b/tests/test_vec_normalize.py @@ -1,10 +1,10 @@ import operator -from typing import Any, Dict +from typing import Any, Dict, Optional -import gym +import gymnasium as gym import numpy as np import pytest -from gym import spaces +from gymnasium import spaces from stable_baselines3 import SAC, TD3, HerReplayBuffer from stable_baselines3.common.envs import FakeImageEnv @@ -35,11 +35,15 @@ def step(self, action): self.t += 1 index = (self.t + self.return_reward_idx) % len(self.returned_rewards) returned_value = self.returned_rewards[index] - return np.array([returned_value]), returned_value, self.t == len(self.returned_rewards), {} + terminated = False + truncated = self.t == len(self.returned_rewards) + return np.array([returned_value]), returned_value, terminated, truncated, {} - def reset(self): + def reset(self, *, seed: Optional[int] = None, options: Optional[Dict] = None): + if seed is not None: + super().reset(seed=seed) self.t = 0 - return np.array([self.returned_rewards[self.return_reward_idx]]) + return np.array([self.returned_rewards[self.return_reward_idx]]), {} class DummyDictEnv(gym.Env): @@ -58,14 +62,16 @@ def __init__(self): ) self.action_space = spaces.Box(low=-1, high=1, shape=(3,), dtype=np.float32) - def reset(self): - return self.observation_space.sample() + def reset(self, *, seed: Optional[int] = None, options: Optional[Dict] = None): + if seed is not None: + super().reset(seed=seed) + return self.observation_space.sample(), {} def step(self, action): obs = self.observation_space.sample() reward = self.compute_reward(obs["achieved_goal"], obs["desired_goal"], {}) - done = np.random.rand() > 0.8 - return obs, reward, done, {} + terminated = np.random.rand() > 0.8 + return obs, reward, terminated, False, {} def compute_reward(self, achieved_goal: np.ndarray, desired_goal: np.ndarray, _info) -> np.float32: distance = np.linalg.norm(achieved_goal - desired_goal, axis=-1) @@ -88,13 +94,15 @@ def __init__(self): ) self.action_space = spaces.Box(low=-1, high=1, shape=(3,), dtype=np.float32) - def reset(self): - return self.observation_space.sample() + def reset(self, *, seed: Optional[int] = None, options: Optional[Dict] = None): + if seed is not None: + super().reset(seed=seed) + return self.observation_space.sample(), {} def step(self, action): obs = self.observation_space.sample() - done = np.random.rand() > 0.8 - return obs, 0.0, done, {} + terminated = np.random.rand() > 0.8 + return obs, 0.0, terminated, False, {} def allclose(obs_1, obs_2): diff --git a/tests/test_vec_stacked_obs.py b/tests/test_vec_stacked_obs.py index 0a7aa39f1..4b2c61444 100644 --- a/tests/test_vec_stacked_obs.py +++ b/tests/test_vec_stacked_obs.py @@ -1,5 +1,5 @@ import numpy as np -from gym import spaces +from gymnasium import spaces from stable_baselines3.common.vec_env.stacked_observations import StackedObservations