diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index e48b065e4ea..4c376ad0c26 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -11,7 +11,6 @@ jobs: - uses: actions/checkout@v2 - run: | docker build -f py.Dockerfile \ - --build-arg MUJOCO_KEY=$MUJOCO_KEY \ --build-arg PYTHON_VERSION=${{ matrix.python-version }} \ --tag gym-docker . - name: Run tests diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index ea16d0eae53..89a02ac19f8 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -9,7 +9,7 @@ repos: hooks: - id: codespell args: - - --ignore-words-list=nd,reacher,thist,ths + - --ignore-words-list=nd,reacher,thist,ths, ure, referenc - repo: https://gitlab.com/PyCQA/flake8 rev: 4.0.1 hooks: diff --git a/README.md b/README.md index 37b63f2d162..d6649dfd2dd 100644 --- a/README.md +++ b/README.md @@ -46,6 +46,11 @@ env.close() Gym keeps strict versioning for reproducibility reasons. All environments end in a suffix like "\_v0". When changes are made to environments that might impact learning results, the number is increased by one to prevent potential confusion. +## MuJoCo Environments + +The latest "\_v4" and future versions of the MuJoCo environments will no longer depend on `mujoco-py`. Instead `mujoco` will be the required dependency for future gym MuJoCo environment versions. Old gym MuJoCo environment versions that depend on `mujoco-py` will still be kept but unmaintained. +To install the dependencies for the latest gym MuJoCo environments use `pip install gym[mujoco]`. Dependencies for old MuJoCo environments can still be installed by `pip install gym[mujoco_py]`. + ## Citation A whitepaper from when Gym just came out is available https://arxiv.org/pdf/1606.01540, and can be cited with the following bibtex entry: diff --git a/gym/envs/__init__.py b/gym/envs/__init__.py index 48a208ded37..3acec6db18c 100644 --- a/gym/envs/__init__.py +++ b/gym/envs/__init__.py @@ -162,6 +162,13 @@ reward_threshold=-3.75, ) +register( + id="Reacher-v4", + entry_point="gym.envs.mujoco.reacher_v4:ReacherEnv", + max_episode_steps=50, + reward_threshold=-3.75, +) + register( id="Pusher-v2", entry_point="gym.envs.mujoco:PusherEnv", @@ -169,6 +176,13 @@ reward_threshold=0.0, ) +register( + id="Pusher-v4", + entry_point="gym.envs.mujoco.pusher_v4:PusherEnv", + max_episode_steps=100, + reward_threshold=0.0, +) + register( id="InvertedPendulum-v2", entry_point="gym.envs.mujoco:InvertedPendulumEnv", @@ -176,6 +190,13 @@ reward_threshold=950.0, ) +register( + id="InvertedPendulum-v4", + entry_point="gym.envs.mujoco.inverted_pendulum_v4:InvertedPendulumEnv", + max_episode_steps=1000, + reward_threshold=950.0, +) + register( id="InvertedDoublePendulum-v2", entry_point="gym.envs.mujoco:InvertedDoublePendulumEnv", @@ -183,6 +204,13 @@ reward_threshold=9100.0, ) +register( + id="InvertedDoublePendulum-v4", + entry_point="gym.envs.mujoco.inverted_double_pendulum_v4:InvertedDoublePendulumEnv", + max_episode_steps=1000, + reward_threshold=9100.0, +) + register( id="HalfCheetah-v2", entry_point="gym.envs.mujoco:HalfCheetahEnv", @@ -197,6 +225,13 @@ reward_threshold=4800.0, ) +register( + id="HalfCheetah-v4", + entry_point="gym.envs.mujoco.half_cheetah_v4:HalfCheetahEnv", + max_episode_steps=1000, + reward_threshold=4800.0, +) + register( id="Hopper-v2", entry_point="gym.envs.mujoco:HopperEnv", @@ -211,6 +246,13 @@ reward_threshold=3800.0, ) +register( + id="Hopper-v4", + entry_point="gym.envs.mujoco.hopper_v4:HopperEnv", + max_episode_steps=1000, + reward_threshold=3800.0, +) + register( id="Swimmer-v2", entry_point="gym.envs.mujoco:SwimmerEnv", @@ -225,6 +267,13 @@ reward_threshold=360.0, ) +register( + id="Swimmer-v4", + entry_point="gym.envs.mujoco.swimmer_v4:SwimmerEnv", + max_episode_steps=1000, + reward_threshold=360.0, +) + register( id="Walker2d-v2", max_episode_steps=1000, @@ -237,6 +286,12 @@ entry_point="gym.envs.mujoco.walker2d_v3:Walker2dEnv", ) +register( + id="Walker2d-v4", + max_episode_steps=1000, + entry_point="gym.envs.mujoco.walker2d_v4:Walker2dEnv", +) + register( id="Ant-v2", entry_point="gym.envs.mujoco:AntEnv", @@ -251,6 +306,13 @@ reward_threshold=6000.0, ) +register( + id="Ant-v4", + entry_point="gym.envs.mujoco.ant_v4:AntEnv", + max_episode_steps=1000, + reward_threshold=6000.0, +) + register( id="Humanoid-v2", entry_point="gym.envs.mujoco:HumanoidEnv", @@ -263,8 +325,20 @@ max_episode_steps=1000, ) +register( + id="Humanoid-v4", + entry_point="gym.envs.mujoco.humanoid_v4:HumanoidEnv", + max_episode_steps=1000, +) + register( id="HumanoidStandup-v2", entry_point="gym.envs.mujoco:HumanoidStandupEnv", max_episode_steps=1000, ) + +register( + id="HumanoidStandup-v4", + entry_point="gym.envs.mujoco.humanoidstandup_v4:HumanoidStandupEnv", + max_episode_steps=1000, +) diff --git a/gym/envs/mujoco/__init__.py b/gym/envs/mujoco/__init__.py index fc4cc058be1..57658054276 100644 --- a/gym/envs/mujoco/__init__.py +++ b/gym/envs/mujoco/__init__.py @@ -9,6 +9,7 @@ from gym.envs.mujoco.humanoidstandup import HumanoidStandupEnv from gym.envs.mujoco.inverted_double_pendulum import InvertedDoublePendulumEnv from gym.envs.mujoco.inverted_pendulum import InvertedPendulumEnv +from gym.envs.mujoco.mujoco_rendering import RenderContextOffscreen, Viewer from gym.envs.mujoco.pusher import PusherEnv from gym.envs.mujoco.reacher import ReacherEnv from gym.envs.mujoco.swimmer import SwimmerEnv diff --git a/gym/envs/mujoco/ant.py b/gym/envs/mujoco/ant.py index e61b787db98..70728dee5ea 100644 --- a/gym/envs/mujoco/ant.py +++ b/gym/envs/mujoco/ant.py @@ -6,7 +6,7 @@ class AntEnv(mujoco_env.MujocoEnv, utils.EzPickle): def __init__(self): - mujoco_env.MujocoEnv.__init__(self, "ant.xml", 5) + mujoco_env.MujocoEnv.__init__(self, "ant.xml", 5, mujoco_bindings="mujoco_py") utils.EzPickle.__init__(self) def step(self, a): diff --git a/gym/envs/mujoco/ant_v3.py b/gym/envs/mujoco/ant_v3.py index aeffa507523..4260fd7d2f0 100644 --- a/gym/envs/mujoco/ant_v3.py +++ b/gym/envs/mujoco/ant_v3.py @@ -9,167 +9,6 @@ class AntEnv(mujoco_env.MujocoEnv, utils.EzPickle): - """ - ### Description - - This environment is based on the environment introduced by Schulman, - Moritz, Levine, Jordan and Abbeel in ["High-Dimensional Continuous Control - Using Generalized Advantage Estimation"](https://arxiv.org/abs/1506.02438). - The ant is a 3D robot consisting of one torso (free rotational body) with - four legs attached to it with each leg having two links. The goal is to - coordinate the four legs to move in the forward (right) direction by applying - torques on the eight hinges connecting the two links of each leg and the torso - (nine parts and eight hinges). - - ### Action Space - The action space is a `Box(-1, 1, (8,), float32)`. An action represents the torques applied at the hinge joints. - - | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit | - |-----|----------------------|---------------|----------------|---------------------------------------|-------|------| - | 0 | Torque applied on the rotor between the torso and front left hip | -1 | 1 | hip_1 (front_left_leg) | hinge | torque (N m) | - | 1 | Torque applied on the rotor between the front left two links | -1 | 1 | angle_1 (front_left_leg) | hinge | torque (N m) | - | 2 | Torque applied on the rotor between the torso and front right hip | -1 | 1 | hip_2 (front_right_leg) | hinge | torque (N m) | - | 3 | Torque applied on the rotor between the front right two links | -1 | 1 | angle_2 (front_right_leg) | hinge | torque (N m) | - | 4 | Torque applied on the rotor between the torso and back left hip | -1 | 1 | hip_3 (back_leg) | hinge | torque (N m) | - | 5 | Torque applied on the rotor between the back left two links | -1 | 1 | angle_3 (back_leg) | hinge | torque (N m) | - | 6 | Torque applied on the rotor between the torso and back right hip | -1 | 1 | hip_4 (right_back_leg) | hinge | torque (N m) | - | 7 | Torque applied on the rotor between the back right two links | -1 | 1 | angle_4 (right_back_leg) | hinge | torque (N m) | - - ### Observation Space - - Observations consist of positional values of different body parts of the ant, - followed by the velocities of those individual parts (their derivatives) with all - the positions ordered before all the velocities. - - By default, observations do not include the x- and y-coordinates of the ant's torso. These may - be included by passing `exclude_current_positions_from_observation=False` during construction. - In that case, the observation space will have 113 dimensions where the first two dimensions - represent the x- and y- coordinates of the ant's torso. - Regardless of whether `exclude_current_positions_from_observation` was set to true or false, the x- and y-coordinates - of the torso will be returned in `info` with keys `"x_position"` and `"y_position"`, respectively. - - However, by default, an observation is a `ndarray` with shape `(111,)` - where the elements correspond to the following: - - | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit | - |-----|-------------------------------------------------------------|----------------|-----------------|----------------------------------------|-------|------| - | 0 | z-coordinate of the torso (centre) | -Inf | Inf | torso | free | position (m) | - | 1 | x-orientation of the torso (centre) | -Inf | Inf | torso | free | angle (rad) | - | 2 | y-orientation of the torso (centre) | -Inf | Inf | torso | free | angle (rad) | - | 3 | z-orientation of the torso (centre) | -Inf | Inf | torso | free | angle (rad) | - | 4 | w-orientation of the torso (centre) | -Inf | Inf | torso | free | angle (rad) | - | 5 | angle between torso and first link on front left | -Inf | Inf | hip_1 (front_left_leg) | hinge | angle (rad) | - | 6 | angle between the two links on the front left | -Inf | Inf | ankle_1 (front_left_leg) | hinge | angle (rad) | - | 7 | angle between torso and first link on front right | -Inf | Inf | hip_2 (front_right_leg) | hinge | angle (rad) | - | 8 | angle between the two links on the front right | -Inf | Inf | ankle_2 (front_right_leg) | hinge | angle (rad) | - | 9 | angle between torso and first link on back left | -Inf | Inf | hip_3 (back_leg) | hinge | angle (rad) | - | 10 | angle between the two links on the back left | -Inf | Inf | ankle_3 (back_leg) | hinge | angle (rad) | - | 11 | angle between torso and first link on back right | -Inf | Inf | hip_4 (right_back_leg) | hinge | angle (rad) | - | 12 | angle between the two links on the back right | -Inf | Inf | ankle_4 (right_back_leg) | hinge | angle (rad) | - | 13 | x-coordinate velocity of the torso | -Inf | Inf | torso | free | velocity (m/s) | - | 14 | y-coordinate velocity of the torso | -Inf | Inf | torso | free | velocity (m/s) | - | 15 | z-coordinate velocity of the torso | -Inf | Inf | torso | free | velocity (m/s) | - | 16 | x-coordinate angular velocity of the torso | -Inf | Inf | torso | free | angular velocity (rad/s) | - | 17 | y-coordinate angular velocity of the torso | -Inf | Inf | torso | free | angular velocity (rad/s) | - | 18 | z-coordinate angular velocity of the torso | -Inf | Inf | torso | free | angular velocity (rad/s) | - | 19 | angular velocity of angle between torso and front left link | -Inf | Inf | hip_1 (front_left_leg) | hinge | angle (rad) | - | 20 | angular velocity of the angle between front left links | -Inf | Inf | ankle_1 (front_left_leg) | hinge | angle (rad) | - | 21 | angular velocity of angle between torso and front right link| -Inf | Inf | hip_2 (front_right_leg) | hinge | angle (rad) | - | 22 | angular velocity of the angle between front right links | -Inf | Inf | ankle_2 (front_right_leg) | hinge | angle (rad) | - | 23 | angular velocity of angle between torso and back left link | -Inf | Inf | hip_3 (back_leg) | hinge | angle (rad) | - | 24 | angular velocity of the angle between back left links | -Inf | Inf | ankle_3 (back_leg) | hinge | angle (rad) | - | 25 | angular velocity of angle between torso and back right link | -Inf | Inf | hip_4 (right_back_leg) | hinge | angle (rad) | - | 26 |angular velocity of the angle between back right links | -Inf | Inf | ankle_4 (right_back_leg) | hinge | angle (rad) | - - - The remaining 14*6 = 84 elements of the observation are contact forces - (external forces - force x, y, z and torque x, y, z) applied to the - center of mass of each of the links. The 14 links are: the ground link, - the torso link, and 3 links for each leg (1 + 1 + 12) with the 6 external forces. - - The (x,y,z) coordinates are translational DOFs while the orientations are rotational - DOFs expressed as quaternions. One can read more about free joints on the [Mujoco Documentation](https://mujoco.readthedocs.io/en/latest/XMLreference.html). - - - **Note:** There have been reported issues that using a Mujoco-Py version > 2.0 results - in the contact forces always being 0. As such we recommend to use a Mujoco-Py version < 2.0 - when using the Ant environment if you would like to report results with contact forces (if - contact forces are not used in your experiments, you can use version > 2.0). - - ### Rewards - The reward consists of three parts: - - *healthy_reward*: Every timestep that the ant is healthy (see definition in section "Episode Termination"), it gets a reward of fixed value `healthy_reward` - - *forward_reward*: A reward of moving forward which is measured as - *(x-coordinate before action - x-coordinate after action)/dt*. *dt* is the time - between actions and is dependent on the `frame_skip` parameter (default is 5), - where the frametime is 0.01 - making the default *dt = 5 * 0.01 = 0.05*. - This reward would be positive if the ant moves forward (in positive x direction). - - *ctrl_cost*: A negative reward for penalising the ant if it takes actions - that are too large. It is measured as *`ctrl_cost_weight` * sum(action2)* - where *`ctr_cost_weight`* is a parameter set for the control and has a default value of 0.5. - - *contact_cost*: A negative reward for penalising the ant if the external contact - force is too large. It is calculated *`contact_cost_weight` * sum(clip(external contact - force to `contact_force_range`)2)*. - - The total reward returned is ***reward*** *=* *healthy_reward + forward_reward - ctrl_cost - contact_cost* and `info` will also contain the individual reward terms. - - ### Starting State - All observations start in state - (0.0, 0.0, 0.75, 1.0, 0.0 ... 0.0) with a uniform noise in the range - of [-`reset_noise_scale`, `reset_noise_scale`] added to the positional values and standard normal noise - with mean 0 and standard deviation `reset_noise_scale` added to the velocity values for - stochasticity. Note that the initial z coordinate is intentionally selected - to be slightly high, thereby indicating a standing up ant. The initial orientation - is designed to make it face forward as well. - - ### Episode Termination - The ant is said to be unhealthy if any of the following happens: - - 1. Any of the state space values is no longer finite - 2. The z-coordinate of the torso is **not** in the closed interval given by `healthy_z_range` (defaults to [0.2, 1.0]) - - If `terminate_when_unhealthy=True` is passed during construction (which is the default), - the episode terminates when any of the following happens: - - 1. The episode duration reaches a 1000 timesteps - 2. The ant is unhealthy - - If `terminate_when_unhealthy=False` is passed, the episode is terminated only when 1000 timesteps are exceeded. - - ### Arguments - - No additional arguments are currently supported in v2 and lower. - - ``` - env = gym.make('Ant-v2') - ``` - - v3 and beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. - - ``` - env = gym.make('Ant-v3', ctrl_cost_weight=0.1, ...) - ``` - - | Parameter | Type | Default |Description | - |-------------------------|------------|--------------|-------------------------------| - | `xml_file` | **str** | `"ant.xml"` | Path to a MuJoCo model | - | `ctrl_cost_weight` | **float** | `0.5` | Weight for *ctrl_cost* term (see section on reward) | - | `contact_cost_weight` | **float** | `5e-4` | Weight for *contact_cost* term (see section on reward) | - | `healthy_reward` | **float** | `1` | Constant reward given if the ant is "healthy" after timestep | - | `terminate_when_unhealthy` | **bool**| `True` | If true, issue a done signal if the z-coordinate of the torso is no longer in the `healthy_z_range` | - | `healthy_z_range` | **tuple** | `(0.2, 1)` | The ant is considered healthy if the z-coordinate of the torso is in this range | - | `contact_force_range` | **tuple** | `(-1, 1)` | Contact forces are clipped to this range in the computation of *contact_cost* | - | `reset_noise_scale` | **float** | `0.1` | Scale of random perturbations of initial position and velocity (see section on Starting State) | - | `exclude_current_positions_from_observation`| **bool** | `True`| Whether or not to omit the x- and y-coordinates from observations. Excluding the position can serve as an inductive bias to induce position-agnostic behavior in policies | - - ### Version History - - * v3: support for gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. rgb rendering comes from tracking camera (so agent does not run away from screen) - * v2: All continuous control environments now use mujoco_py >= 1.50 - * v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments. - * v0: Initial versions release (1.0.0) - """ - def __init__( self, xml_file="ant.xml", @@ -199,7 +38,7 @@ def __init__( exclude_current_positions_from_observation ) - mujoco_env.MujocoEnv.__init__(self, xml_file, 5) + mujoco_env.MujocoEnv.__init__(self, xml_file, 5, mujoco_bindings="mujoco_py") @property def healthy_reward(self): diff --git a/gym/envs/mujoco/ant_v4.py b/gym/envs/mujoco/ant_v4.py new file mode 100644 index 00000000000..0b8f2bbd785 --- /dev/null +++ b/gym/envs/mujoco/ant_v4.py @@ -0,0 +1,308 @@ +import numpy as np + +from gym import utils +from gym.envs.mujoco import mujoco_env + +DEFAULT_CAMERA_CONFIG = { + "distance": 4.0, +} + + +class AntEnv(mujoco_env.MujocoEnv, utils.EzPickle): + """ + ### Description + + This environment is based on the environment introduced by Schulman, + Moritz, Levine, Jordan and Abbeel in ["High-Dimensional Continuous Control + Using Generalized Advantage Estimation"](https://arxiv.org/abs/1506.02438). + The ant is a 3D robot consisting of one torso (free rotational body) with + four legs attached to it with each leg having two links. The goal is to + coordinate the four legs to move in the forward (right) direction by applying + torques on the eight hinges connecting the two links of each leg and the torso + (nine parts and eight hinges). + + ### Action Space + The agent take a 8-element vector for actions. + + The action space is a continuous `(action, action, action, action, action, action, + action, action)` all in `[-1, 1]`, where `action` represents the numerical torques + applied at the hinge joints. + + | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit | + |-----|----------------------|---------------|----------------|---------------------------------------|-------|------| + | 0 | Torque applied on the rotor between the torso and front left hip | -1 | 1 | hip_1 (front_left_leg) | hinge | torque (N m) | + | 1 | Torque applied on the rotor between the front left two links | -1 | 1 | angle_1 (front_left_leg) | hinge | torque (N m) | + | 2 | Torque applied on the rotor between the torso and front right hip | -1 | 1 | hip_2 (front_right_leg) | hinge | torque (N m) | + | 3 | Torque applied on the rotor between the front right two links | -1 | 1 | angle_2 (front_right_leg) | hinge | torque (N m) | + | 4 | Torque applied on the rotor between the torso and back left hip | -1 | 1 | hip_3 (back_leg) | hinge | torque (N m) | + | 5 | Torque applied on the rotor between the back left two links | -1 | 1 | angle_3 (back_leg) | hinge | torque (N m) | + | 6 | Torque applied on the rotor between the torso and back right hip | -1 | 1 | hip_4 (right_back_leg) | hinge | torque (N m) | + | 7 | Torque applied on the rotor between the back right two links | -1 | 1 | angle_4 (right_back_leg) | hinge | torque (N m) | + + ### Observation Space + + The state space consists of positional values of different body parts of the ant, + followed by the velocities of those individual parts (their derivatives) with all + the positions ordered before all the velocities. + + The observation is a `ndarray` with shape `(111,)` where the elements correspond to the following: + + | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit | + |-----|-------------------------------------------------------------|----------------|-----------------|----------------------------------------|-------|------| + | 0 | x-coordinate of the torso (centre) | -Inf | Inf | torso | free | position (m) | + | 1 | y-coordinate of the torso (centre) | -Inf | Inf | torso | free | position (m) | + | 2 | z-coordinate of the torso (centre) | -Inf | Inf | torso | free | position (m) | + | 3 | x-orientation of the torso (centre) | -Inf | Inf | torso | free | angle (rad) | + | 4 | y-orientation of the torso (centre) | -Inf | Inf | torso | free | angle (rad) | + | 5 | z-orientation of the torso (centre) | -Inf | Inf | torso | free | angle (rad) | + | 6 | w-orientation of the torso (centre) | -Inf | Inf | torso | free | angle (rad) | + | 7 | angle between torso and first link on front left | -Inf | Inf | hip_1 (front_left_leg) | hinge | angle (rad) | + | 8 | angle between the two links on the front left | -Inf | Inf | ankle_1 (front_left_leg) | hinge | angle (rad) | + | 9 | angle between torso and first link on front right | -Inf | Inf | hip_2 (front_right_leg) | hinge | angle (rad) | + | 10 | angle between the two links on the front right | -Inf | Inf | ankle_2 (front_right_leg) | hinge | angle (rad) | + | 11 | angle between torso and first link on back left | -Inf | Inf | hip_3 (back_leg) | hinge | angle (rad) | + | 12 | angle between the two links on the back left | -Inf | Inf | ankle_3 (back_leg) | hinge | angle (rad) | + | 13 | angle between torso and first link on back right | -Inf | Inf | hip_4 (right_back_leg) | hinge | angle (rad) | + | 14 | angle between the two links on the back right | -Inf | Inf | ankle_4 (right_back_leg) | hinge | angle (rad) | + | 15 | x-coordinate velocity of the torso | -Inf | Inf | torso | free | velocity (m/s) | + | 16 | y-coordinate velocity of the torso | -Inf | Inf | torso | free | velocity (m/s) | + | 17 | z-coordinate velocity of the torso | -Inf | Inf | torso | free | velocity (m/s) | + | 18 | x-coordinate angular velocity of the torso | -Inf | Inf | torso | free | angular velocity (rad/s) | + | 19 | y-coordinate angular velocity of the torso | -Inf | Inf | torso | free | angular velocity (rad/s) | + | 20 | z-coordinate angular velocity of the torso | -Inf | Inf | torso | free | angular velocity (rad/s) | + | 21 | angular velocity of angle between torso and front left link | -Inf | Inf | hip_1 (front_left_leg) | hinge | angle (rad) | + | 22 | angular velocity of the angle between front left links | -Inf | Inf | ankle_1 (front_left_leg) | hinge | angle (rad) | + | 23 | angular velocity of angle between torso and front right link| -Inf | Inf | hip_2 (front_right_leg) | hinge | angle (rad) | + | 24 | angular velocity of the angle between front right links | -Inf | Inf | ankle_2 (front_right_leg) | hinge | angle (rad) | + | 25 | angular velocity of angle between torso and back left link | -Inf | Inf | hip_3 (back_leg) | hinge | angle (rad) | + | 26 | angular velocity of the angle between back left links | -Inf | Inf | ankle_3 (back_leg) | hinge | angle (rad) | + | 27 | angular velocity of angle between torso and back right link | -Inf | Inf | hip_4 (right_back_leg) | hinge | angle (rad) | + | 28 |angular velocity of the angle between back right links | -Inf | Inf | ankle_4 (right_back_leg) | hinge | angle (rad) | + + + The remaining 14*6 = 84 elements in the state are contact forces + (external forces - force x, y, z and torque x, y, z) applied to the + center of mass of each of the links. The 14 links are: the ground link, + the torso link, and 3 links for each leg (1 + 1 + 12) with the 6 external forces. + + The (x,y,z) coordinates are translational DOFs while the orientations are rotational + DOFs expressed as quaternions. One can read more about free joints on the [Mujoco Documentation](https://mujoco.readthedocs.io/en/latest/XMLreference.html). + + **Note:** There are 29 elements in the table above - giving rise to `(113,)` elements + in the state space. In practice (and Gym implementation), the first two positional + elements are omitted from the state space since the reward function is calculated based + on the x-coordinate value. This value is hidden from the algorithm, which in turn has to + develop an abstract understanding of it from the observed rewards. Therefore, observation + space has shape `(111,)` instead of `(113,)` and the table should not have the first two rows. + + **Note:** Ant-v4 environment no longer has the following contact forces issue. + If using previous Ant versions from v4, there have been reported issues that using a Mujoco-Py version > 2.0 results + in the contact forces always being 0. As such we recommend to use a Mujoco-Py version < 2.0 + when using the Ant environment if you would like to report results with contact forces (if + contact forces are not used in your experiments, you can use version > 2.0). + + **Note:** Ant-v4 has the option of including contact forces in the observation space. To add contact forces set the argument + 'use_contact_forces" to True. The default value is False. Also note that training including contact forces can perform worse + than not using them as shown in (https://github.com/openai/gym/pull/2762). + + ### Rewards + The reward consists of three parts: + - *survive_reward*: Every timestep that the ant is alive, it gets a reward of 1. + - *forward_reward*: A reward of moving forward which is measured as + *(x-coordinate before action - x-coordinate after action)/dt*. *dt* is the time + between actions and is dependent on the frame_skip parameter (default is 5), + where the *dt* for one frame is 0.01 - making the default *dt = 5 * 0.01 = 0.05*. + This reward would be positive if the ant moves forward (right) desired. + - *ctrl_cost*: A negative reward for penalising the ant if it takes actions + that are too large. It is measured as *coefficient **x** sum(action2)* + where *coefficient* is a parameter set for the control and has a default value of 0.5. + - *contact_cost*: A negative reward for penalising the ant if the external contact + force is too large. It is calculated *0.5 * 0.001 * sum(clip(external contact + force to [-1,1])2)*. + + The total reward returned is ***reward*** *=* *alive survive_reward + forward_reward - ctrl_cost - contact_cost* + + ### Starting State + All observations start in state + (0.0, 0.0, 0.75, 1.0, 0.0 ... 0.0) with a uniform noise in the range + of [-0.1, 0.1] added to the positional values and standard normal noise + with 0 mean and 0.1 standard deviation added to the velocity values for + stochasticity. Note that the initial z coordinate is intentionally selected + to be slightly high, thereby indicating a standing up ant. The initial orientation + is designed to make it face forward as well. + + ### Episode Termination + The episode terminates when any of the following happens: + + 1. The episode duration reaches a 1000 timesteps + 2. Any of the state space values is no longer finite + 3. The y-orientation (index 2) in the state is **not** in the range `[0.2, 1.0]` + + ### Arguments + + No additional arguments are currently supported (in v2 and lower), but modifications + can be made to the XML file in the assets folder (or by changing the path to a modified + XML file in another folder). + + ``` + env = gym.make('Ant-v2') + ``` + + v3 and beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. + + ``` + env = gym.make('Ant-v3', ctrl_cost_weight=0.1, ...) + ``` + + ### Version History + * v4: all mujoco environments now use the mujoco binidings in mujoco>=2.1.3 + * v3: support for gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. rgb rendering comes from tracking camera (so agent does not run away from screen) + * v2: All continuous control environments now use mujoco_py >= 1.50 + * v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments. + * v0: Initial versions release (1.0.0) + """ + + def __init__( + self, + xml_file="ant.xml", + ctrl_cost_weight=0.5, + use_contact_forces=False, + contact_cost_weight=5e-4, + healthy_reward=1.0, + terminate_when_unhealthy=True, + healthy_z_range=(0.2, 1.0), + contact_force_range=(-1.0, 1.0), + reset_noise_scale=0.1, + exclude_current_positions_from_observation=True, + ): + utils.EzPickle.__init__(**locals()) + + self._ctrl_cost_weight = ctrl_cost_weight + self._contact_cost_weight = contact_cost_weight + + self._healthy_reward = healthy_reward + self._terminate_when_unhealthy = terminate_when_unhealthy + self._healthy_z_range = healthy_z_range + + self._contact_force_range = contact_force_range + + self._reset_noise_scale = reset_noise_scale + + self._use_contact_forces = use_contact_forces + + self._exclude_current_positions_from_observation = ( + exclude_current_positions_from_observation + ) + + mujoco_env.MujocoEnv.__init__(self, xml_file, 5) + + @property + def healthy_reward(self): + return ( + float(self.is_healthy or self._terminate_when_unhealthy) + * self._healthy_reward + ) + + def control_cost(self, action): + control_cost = self._ctrl_cost_weight * np.sum(np.square(action)) + return control_cost + + @property + def contact_forces(self): + raw_contact_forces = self.data.cfrc_ext + min_value, max_value = self._contact_force_range + contact_forces = np.clip(raw_contact_forces, min_value, max_value) + return contact_forces + + @property + def contact_cost(self): + contact_cost = self._contact_cost_weight * np.sum( + np.square(self.contact_forces) + ) + return contact_cost + + @property + def is_healthy(self): + state = self.state_vector() + min_z, max_z = self._healthy_z_range + is_healthy = np.isfinite(state).all() and min_z <= state[2] <= max_z + return is_healthy + + @property + def done(self): + done = not self.is_healthy if self._terminate_when_unhealthy else False + return done + + def step(self, action): + xy_position_before = self.get_body_com("torso")[:2].copy() + self.do_simulation(action, self.frame_skip) + xy_position_after = self.get_body_com("torso")[:2].copy() + + xy_velocity = (xy_position_after - xy_position_before) / self.dt + x_velocity, y_velocity = xy_velocity + + forward_reward = x_velocity + healthy_reward = self.healthy_reward + + rewards = forward_reward + healthy_reward + + costs = ctrl_cost = self.control_cost(action) + + done = self.done + observation = self._get_obs() + info = { + "reward_forward": forward_reward, + "reward_ctrl": -ctrl_cost, + "reward_survive": healthy_reward, + "x_position": xy_position_after[0], + "y_position": xy_position_after[1], + "distance_from_origin": np.linalg.norm(xy_position_after, ord=2), + "x_velocity": x_velocity, + "y_velocity": y_velocity, + "forward_reward": forward_reward, + } + if self._use_contact_forces: + contact_cost = self.contact_cost + costs += contact_cost + info["reward_ctrl"] = -contact_cost + + reward = rewards - costs + + return observation, reward, done, info + + def _get_obs(self): + position = self.data.qpos.flat.copy() + velocity = self.data.qvel.flat.copy() + + if self._exclude_current_positions_from_observation: + position = position[2:] + + if self._use_contact_forces: + contact_force = self.contact_forces.flat.copy() + return np.concatenate((position, velocity, contact_force)) + else: + return np.concatenate((position, velocity)) + + def reset_model(self): + noise_low = -self._reset_noise_scale + noise_high = self._reset_noise_scale + + qpos = self.init_qpos + self.np_random.uniform( + low=noise_low, high=noise_high, size=self.model.nq + ) + qvel = ( + self.init_qvel + + self._reset_noise_scale * self.np_random.standard_normal(self.model.nv) + ) + self.set_state(qpos, qvel) + + observation = self._get_obs() + + return observation + + def viewer_setup(self): + for key, value in DEFAULT_CAMERA_CONFIG.items(): + if isinstance(value, np.ndarray): + getattr(self.viewer.cam, key)[:] = value + else: + setattr(self.viewer.cam, key, value) diff --git a/gym/envs/mujoco/assets/inverted_pendulum.xml b/gym/envs/mujoco/assets/inverted_pendulum.xml index 396a0b3499a..85145042985 100644 --- a/gym/envs/mujoco/assets/inverted_pendulum.xml +++ b/gym/envs/mujoco/assets/inverted_pendulum.xml @@ -22,6 +22,6 @@ - + \ No newline at end of file diff --git a/gym/envs/mujoco/half_cheetah.py b/gym/envs/mujoco/half_cheetah.py index 53a206fb656..2c32de0b0a9 100644 --- a/gym/envs/mujoco/half_cheetah.py +++ b/gym/envs/mujoco/half_cheetah.py @@ -6,7 +6,9 @@ class HalfCheetahEnv(mujoco_env.MujocoEnv, utils.EzPickle): def __init__(self): - mujoco_env.MujocoEnv.__init__(self, "half_cheetah.xml", 5) + mujoco_env.MujocoEnv.__init__( + self, "half_cheetah.xml", 5, mujoco_bindings="mujoco_py" + ) utils.EzPickle.__init__(self) def step(self, action): diff --git a/gym/envs/mujoco/half_cheetah_v3.py b/gym/envs/mujoco/half_cheetah_v3.py index 64104f867a8..c6da69f2cc7 100644 --- a/gym/envs/mujoco/half_cheetah_v3.py +++ b/gym/envs/mujoco/half_cheetah_v3.py @@ -10,125 +10,6 @@ class HalfCheetahEnv(mujoco_env.MujocoEnv, utils.EzPickle): - """ - ### Description - - This environment is based on the work by P. Wawrzy´nski in - ["A Cat-Like Robot Real-Time Learning to Run"](http://staff.elka.pw.edu.pl/~pwawrzyn/pub-s/0812_LSCLRR.pdf). - The HalfCheetah is a 2-dimensional robot consisting of 9 links and 8 - joints connecting them (including two paws). The goal is to apply a torque - on the joints to make the cheetah run forward (right) as fast as possible, - with a positive reward allocated based on the distance moved forward and a - negative reward allocated for moving backward. The torso and head of the - cheetah are fixed, and the torque can only be applied on the other 6 joints - over the front and back thighs (connecting to the torso), shins - (connecting to the thighs) and feet (connecting to the shins). - - ### Action Space - The action space is a `Box(-1, 1, (6,), float32)`. An action represents the torques applied between *links*. - - | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit | - |-------|--------------------------------------|---------------|----------------|---------------------------------------|-------|------| - | 0 | Torque applied on the back thigh rotor | -1 | 1 | bthigh | hinge | torque (N m) | - | 1 | Torque applied on the back shin rotor | -1 | 1 | bshin | hinge | torque (N m) | - | 2 | Torque applied on the back foot rotor | -1 | 1 | bfoot | hinge | torque (N m) | - | 3 | Torque applied on the front thigh rotor| -1 | 1 | fthigh | hinge | torque (N m) | - | 4 | Torque applied on the front shin rotor | -1 | 1 | fshin | hinge | torque (N m) | - | 5 | Torque applied on the front foot rotor | -1 | 1 | ffoot | hinge | torque (N m) | - - ### Observation Space - - Observations consist of positional values of different body parts of the - cheetah, followed by the velocities of those individual parts (their derivatives) with all the positions ordered before all the velocities. - - By default, observations do not include the x-coordinate of the cheetah's center of mass. It may - be included by passing `exclude_current_positions_from_observation=False` during construction. - In that case, the observation space will have 18 dimensions where the first dimension - represents the x-coordinate of the cheetah's center of mass. - Regardless of whether `exclude_current_positions_from_observation` was set to true or false, the x-coordinate - will be returned in `info` with key `"x_position"`. - - However, by default, the observation is a `ndarray` with shape `(17,)` where the elements correspond to the following: - - - | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint| Unit | - |-----|-----------------------|----------------------|--------------------|----------------------|--------------------|--------------------| - | 0 | z-coordinate of the front tip | -Inf | Inf | rootz | slide | position (m) | - | 1 | angle of the front tip | -Inf | Inf | rooty | hinge | angle (rad) | - | 2 | angle of the second rotor | -Inf | Inf | bthigh | hinge | angle (rad) | - | 3 | angle of the second rotor | -Inf | Inf | bshin | hinge | angle (rad) | - | 4 | velocity of the tip along the x-axis | -Inf | Inf | bfoot | hinge | angle (rad) | - | 5 | velocity of the tip along the y-axis | -Inf | Inf | fthigh | hinge | angle (rad) | - | 6 | angular velocity of front tip | -Inf | Inf | fshin | hinge | angle (rad) | - | 7 | angular velocity of second rotor | -Inf | Inf | ffoot | hinge | angle (rad) | - | 8 | x-coordinate of the front tip | -Inf | Inf | rootx | slide | velocity (m/s) | - | 9 | y-coordinate of the front tip | -Inf | Inf | rootz | slide | velocity (m/s) | - | 10 | angle of the front tip | -Inf | Inf | rooty | hinge | angular velocity (rad/s) | - | 11 | angle of the second rotor | -Inf | Inf | bthigh | hinge | angular velocity (rad/s) | - | 12 | angle of the second rotor | -Inf | Inf | bshin | hinge | angular velocity (rad/s) | - | 13 | velocity of the tip along the x-axis | -Inf | Inf | bfoot | hinge | angular velocity (rad/s) | - | 14 | velocity of the tip along the y-axis | -Inf | Inf | fthigh | hinge |angular velocity (rad/s) | - | 15 | angular velocity of front tip | -Inf | Inf | fshin | hinge | angular velocity (rad/s) | - | 16 | angular velocity of second rotor | -Inf | Inf | ffoot | hinge | angular velocity (rad/s) | - - ### Rewards - The reward consists of two parts: - - *forward_reward*: A reward of moving forward which is measured - as *`forward_reward_weight` * (x-coordinate before action - x-coordinate after action)/dt*. *dt* is - the time between actions and is dependent on the frame_skip parameter - (fixed to 5), where the frametime is 0.01 - making the - default *dt = 5 * 0.01 = 0.05*. This reward would be positive if the cheetah - runs forward (right). - - *ctrl_cost*: A cost for penalising the cheetah if it takes - actions that are too large. It is measured as *`ctrl_cost_weight` * - sum(action2)* where *`ctrl_cost_weight`* is a parameter set for the - control and has a default value of 0.1 - - The total reward returned is ***reward*** *=* *forward_reward - ctrl_cost* and `info` will also contain the individual reward terms - - ### Starting State - All observations start in state (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, - 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,) with a noise added to the - initial state for stochasticity. As seen before, the first 8 values in the - state are positional and the last 9 values are velocity. A uniform noise in - the range of [-`reset_noise_scale`, `reset_noise_scale`] is added to the positional values while a standard - normal noise with a mean of 0 and standard deviation of `reset_noise_scale` is added to the - initial velocity values of all zeros. - - ### Episode Termination - The episode terminates when the episode length is greater than 1000. - - ### Arguments - - No additional arguments are currently supported in v2 and lower. - - ``` - env = gym.make('HalfCheetah-v2') - ``` - - v3 and beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. - - ``` - env = gym.make('HalfCheetah-v3', ctrl_cost_weight=0.1, ....) - ``` - - | Parameter | Type | Default |Description | - |-------------------------|------------|----------------------|-------------------------------| - | `xml_file` | **str** | `"half_cheetah.xml"` | Path to a MuJoCo model | - | `forward_reward_weight` | **float** | `1.0` | Weight for *forward_reward* term (see section on reward) | - | `ctrl_cost_weight` | **float** | `0.1` | Weight for *ctrl_cost* weight (see section on reward) | - | `reset_noise_scale` | **float** | `0.1` | Scale of random perturbations of initial position and velocity (see section on Starting State) | - | `exclude_current_positions_from_observation`| **bool** | `True` | Whether or not to omit the x-coordinate from observations. Excluding the position can serve as an inductive bias to induce position-agnostic behavior in policies | - - - ### Version History - - * v3: support for gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. rgb rendering comes from tracking camera (so agent does not run away from screen) - * v2: All continuous control environments now use mujoco_py >= 1.50 - * v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments. - * v0: Initial versions release (1.0.0) - """ - def __init__( self, xml_file="half_cheetah.xml", @@ -149,7 +30,7 @@ def __init__( exclude_current_positions_from_observation ) - mujoco_env.MujocoEnv.__init__(self, xml_file, 5) + mujoco_env.MujocoEnv.__init__(self, xml_file, 5, mujoco_bindings="mujoco_py") def control_cost(self, action): control_cost = self._ctrl_cost_weight * np.sum(np.square(action)) diff --git a/gym/envs/mujoco/half_cheetah_v4.py b/gym/envs/mujoco/half_cheetah_v4.py new file mode 100644 index 00000000000..e7454855f4c --- /dev/null +++ b/gym/envs/mujoco/half_cheetah_v4.py @@ -0,0 +1,230 @@ +__credits__ = ["Rushiv Arora"] +import numpy as np + +from gym import utils +from gym.envs.mujoco import mujoco_env + +DEFAULT_CAMERA_CONFIG = { + "distance": 4.0, +} + + +class HalfCheetahEnv(mujoco_env.MujocoEnv, utils.EzPickle): + """ + ### Description + + This environment is based on the work by P. Wawrzy´nski in + ["A Cat-Like Robot Real-Time Learning to Run"](http://staff.elka.pw.edu.pl/~pwawrzyn/pub-s/0812_LSCLRR.pdf). + The HalfCheetah is a 2-dimensional robot consisting of 9 links and 8 + joints connecting them (including two paws). The goal is to apply a torque + on the joints to make the cheetah run forward (right) as fast as possible, + with a positive reward allocated based on the distance moved forward and a + negative reward allocated for moving backward. The torso and head of the + cheetah are fixed, and the torque can only be applied on the other 6 joints + over the front and back thighs (connecting to the torso), shins + (connecting to the thighs) and feet (connecting to the shins). + + ### Action Space + The agent take a 6-element vector for actions. + The action space is a continuous `(action, action, action, action, action, action)` all in `[-1.0, 1.0]`, where `action` represents the numerical torques applied between *links* + + | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit | + |-------|--------------------------------------|---------------|----------------|---------------------------------------|-------|------| + | 0 | Torque applied on the back thigh rotor | -1 | 1 | bthigh | hinge | torque (N m) | + | 1 | Torque applied on the back shin rotor | -1 | 1 | bshin | hinge | torque (N m) | + | 2 | Torque applied on the back foot rotor | -1 | 1 | bfoot | hinge | torque (N m) | + | 3 | Torque applied on the front thigh rotor| -1 | 1 | fthigh | hinge | torque (N m) | + | 4 | Torque applied on the front shin rotor | -1 | 1 | fshin | hinge | torque (N m) | + | 5 | Torque applied on the front foot rotor | -1 | 1 | ffoot | hinge | torque (N m) | + + ### Observation Space + + The state space consists of positional values of different body parts of the + cheetah, followed by the velocities of those individual parts (their derivatives) with all the positions ordered before all the velocities. + + The observation is a `ndarray` with shape `(17,)` where the elements correspond to the following: + + | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint| Unit | + |-----|-----------------------|----------------------|--------------------|----------------------|--------------------|--------------------| + | 0 | x-coordinate of the center of mass | -Inf | Inf | rootx | slide | position (m) | + | 1 | y-coordinate of the center of mass | -Inf | Inf | rootz | slide | position (m) | + | 2 | angle of the front tip | -Inf | Inf | rooty | hinge | angle (rad) | + | 3 | angle of the back thigh rotor | -Inf | Inf | bthigh | hinge | angle (rad) | + | 4 | angle of the back shin rotor | -Inf | Inf | bshin | hinge | angle (rad) | + | 5 | angle of the back foot rotor | -Inf | Inf | bfoot | hinge | angle (rad) | + | 6 | velocity of the tip along the y-axis | -Inf | Inf | fthigh | hinge | angle (rad) | + | 7 | angular velocity of front tip | -Inf | Inf | fshin | hinge | angle (rad) | + | 8 | angular velocity of second rotor | -Inf | Inf | ffoot | hinge | angle (rad) | + | 9 | x-coordinate of the front tip | -Inf | Inf | rootx | slide | velocity (m/s) | + | 10 | y-coordinate of the front tip | -Inf | Inf | rootz | slide | velocity (m/s) | + | 11 | angle of the front tip | -Inf | Inf | rooty | hinge | angular velocity (rad/s) | + | 12 | angle of the second rotor | -Inf | Inf | bthigh | hinge | angular velocity (rad/s) | + | 13 | angle of the second rotor | -Inf | Inf | bshin | hinge | angular velocity (rad/s) | + | 14 | velocity of the tip along the x-axis | -Inf | Inf | bfoot | hinge | angular velocity (rad/s) | + | 15 | velocity of the tip along the y-axis | -Inf | Inf | fthigh | hinge |angular velocity (rad/s) | + | 16 | angular velocity of front tip | -Inf | Inf | fshin | hinge | angular velocity (rad/s) | + | 17 | angular velocity of second rotor | -Inf | Inf | ffoot | hinge | angular velocity (rad/s) | + + + **Note:** + In practice (and Gym implementation), the first positional element is + omitted from the state space since the reward function is calculated based + on that value. This value is hidden from the algorithm, which in turn has + to develop an abstract understanding of it from the observed rewards. + Therefore, observation space has shape `(8,)` and looks like: + + | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint| Unit | + |-----|-----------------------|----------------------|--------------------|----------------------|--------------------|--------------------| + | 0 | y-coordinate of the front tip | -Inf | Inf | rootz | slide | position (m) | + | 1 | angle of the front tip | -Inf | Inf | rooty | hinge | angle (rad) | + | 2 | angle of the second rotor | -Inf | Inf | bthigh | hinge | angle (rad) | + | 3 | angle of the second rotor | -Inf | Inf | bshin | hinge | angle (rad) | + | 4 | velocity of the tip along the x-axis | -Inf | Inf | bfoot | hinge | angle (rad) | + | 5 | velocity of the tip along the y-axis | -Inf | Inf | fthigh | hinge | angle (rad) | + | 6 | angular velocity of front tip | -Inf | Inf | fshin | hinge | angle (rad) | + | 7 | angular velocity of second rotor | -Inf | Inf | ffoot | hinge | angle (rad) | + | 8 | x-coordinate of the front tip | -Inf | Inf | rootx | slide | velocity (m/s) | + | 9 | y-coordinate of the front tip | -Inf | Inf | rootz | slide | velocity (m/s) | + | 10 | angle of the front tip | -Inf | Inf | rooty | hinge | angular velocity (rad/s) | + | 11 | angle of the second rotor | -Inf | Inf | bthigh | hinge | angular velocity (rad/s) | + | 12 | angle of the second rotor | -Inf | Inf | bshin | hinge | angular velocity (rad/s) | + | 13 | velocity of the tip along the x-axis | -Inf | Inf | bfoot | hinge | angular velocity (rad/s) | + | 14 | velocity of the tip along the y-axis | -Inf | Inf | fthigh | hinge |angular velocity (rad/s) | + | 15 | angular velocity of front tip | -Inf | Inf | fshin | hinge | angular velocity (rad/s) | + | 16 | angular velocity of second rotor | -Inf | Inf | ffoot | hinge | angular velocity (rad/s) | + + ### Rewards + The reward consists of two parts: + - *reward_run*: A reward of moving forward which is measured + as *(x-coordinate before action - x-coordinate after action)/dt*. *dt* is + the time between actions and is dependeent on the frame_skip parameter + (default is 5), where the *dt* for one frame is 0.01 - making the + default *dt = 5*0.01 = 0.05*. This reward would be positive if the cheetah + runs forward (right) desired. + - *reward_control*: A negative reward for penalising the cheetah if it takes + actions that are too large. It is measured as *-coefficient x + sum(action2)* where *coefficient* is a parameter set for the + control and has a default value of 0.1 + + The total reward returned is ***reward*** *=* *reward_run + reward_control* + + ### Starting State + All observations start in state (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,) with a noise added to the + initial state for stochasticity. As seen before, the first 8 values in the + state are positional and the last 9 values are velocity. A uniform noise in + the range of [-0.1, 0.1] is added to the positional values while a standard + normal noise with a mean of 0 and standard deviation of 0.1 is added to the + initial velocity values of all zeros. + + ### Episode Termination + The episode terminates when the episode length is greater than 1000. + + ### Arguments + + No additional arguments are currently supported (in v2 and lower), but + modifications can be made to the XML file in the assets folder at + `gym/envs/mujoco/assets/half_cheetah.xml` (or by changing the path to a + modified XML file in another folder). + + ``` + env = gym.make('HalfCheetah-v2') + ``` + + v3 and beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. + + ``` + env = gym.make('HalfCheetah-v3', ctrl_cost_weight=0.1, ....) + ``` + + + ### Version History + + * v4: all mujoco environments now use the mujoco binidings in mujoco>=2.1.3 + * v3: support for gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. rgb rendering comes from tracking camera (so agent does not run away from screen) + * v2: All continuous control environments now use mujoco_py >= 1.50 + * v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments. + * v0: Initial versions release (1.0.0) + """ + + def __init__( + self, + xml_file="half_cheetah.xml", + forward_reward_weight=1.0, + ctrl_cost_weight=0.1, + reset_noise_scale=0.1, + exclude_current_positions_from_observation=True, + ): + utils.EzPickle.__init__(**locals()) + + self._forward_reward_weight = forward_reward_weight + + self._ctrl_cost_weight = ctrl_cost_weight + + self._reset_noise_scale = reset_noise_scale + + self._exclude_current_positions_from_observation = ( + exclude_current_positions_from_observation + ) + + mujoco_env.MujocoEnv.__init__(self, xml_file, 5) + + def control_cost(self, action): + control_cost = self._ctrl_cost_weight * np.sum(np.square(action)) + return control_cost + + def step(self, action): + x_position_before = self.data.qpos[0] + self.do_simulation(action, self.frame_skip) + x_position_after = self.data.qpos[0] + x_velocity = (x_position_after - x_position_before) / self.dt + + ctrl_cost = self.control_cost(action) + + forward_reward = self._forward_reward_weight * x_velocity + + observation = self._get_obs() + reward = forward_reward - ctrl_cost + done = False + info = { + "x_position": x_position_after, + "x_velocity": x_velocity, + "reward_run": forward_reward, + "reward_ctrl": -ctrl_cost, + } + + return observation, reward, done, info + + def _get_obs(self): + position = self.data.qpos.flat.copy() + velocity = self.data.qvel.flat.copy() + + if self._exclude_current_positions_from_observation: + position = position[1:] + + observation = np.concatenate((position, velocity)).ravel() + return observation + + def reset_model(self): + noise_low = -self._reset_noise_scale + noise_high = self._reset_noise_scale + + qpos = self.init_qpos + self.np_random.uniform( + low=noise_low, high=noise_high, size=self.model.nq + ) + qvel = ( + self.init_qvel + + self._reset_noise_scale * self.np_random.standard_normal(self.model.nv) + ) + + self.set_state(qpos, qvel) + + observation = self._get_obs() + return observation + + def viewer_setup(self): + for key, value in DEFAULT_CAMERA_CONFIG.items(): + if isinstance(value, np.ndarray): + getattr(self.viewer.cam, key)[:] = value + else: + setattr(self.viewer.cam, key, value) diff --git a/gym/envs/mujoco/hopper.py b/gym/envs/mujoco/hopper.py index ad459bda2a2..34c69be720f 100644 --- a/gym/envs/mujoco/hopper.py +++ b/gym/envs/mujoco/hopper.py @@ -6,7 +6,9 @@ class HopperEnv(mujoco_env.MujocoEnv, utils.EzPickle): def __init__(self): - mujoco_env.MujocoEnv.__init__(self, "hopper.xml", 4) + mujoco_env.MujocoEnv.__init__( + self, "hopper.xml", 4, mujoco_bindings="mujoco_py" + ) utils.EzPickle.__init__(self) def step(self, a): diff --git a/gym/envs/mujoco/hopper_v3.py b/gym/envs/mujoco/hopper_v3.py index 807b6976f1b..cd16b07bc91 100644 --- a/gym/envs/mujoco/hopper_v3.py +++ b/gym/envs/mujoco/hopper_v3.py @@ -14,130 +14,6 @@ class HopperEnv(mujoco_env.MujocoEnv, utils.EzPickle): - """ - ### Description - - This environment is based on the work done by Erez, Tassa, and Todorov in - ["Infinite Horizon Model Predictive Control for Nonlinear Periodic Tasks"](http://www.roboticsproceedings.org/rss07/p10.pdf). The environment aims to - increase the number of independent state and control variables as compared to - the classic control environments. The hopper is a two-dimensional - one-legged figure that consist of four main body parts - the torso at the - top, the thigh in the middle, the leg in the bottom, and a single foot on - which the entire body rests. The goal is to make hops that move in the - forward (right) direction by applying torques on the three hinges - connecting the four body parts. - - ### Action Space - The action space is a `Box(-1, 1, (3,), float32)`. An action represents the torques applied between *links* - - | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit | - |-----|------------------------------------|-------------|-------------|----------------------------------|-------|--------------| - | 0 | Torque applied on the thigh rotor | -1 | 1 | thigh_joint | hinge | torque (N m) | - | 1 | Torque applied on the leg rotor | -1 | 1 | leg_joint | hinge | torque (N m) | - | 3 | Torque applied on the foot rotor | -1 | 1 | foot_joint | hinge | torque (N m) | - - ### Observation Space - - Observations consist of positional values of different body parts of the - hopper, followed by the velocities of those individual parts - (their derivatives) with all the positions ordered before all the velocities. - - By default, observations do not include the x-coordinate of the hopper. It may - be included by passing `exclude_current_positions_from_observation=False` during construction. - In that case, the observation space will have 12 dimensions where the first dimension - represents the x-coordinate of the hopper. - Regardless of whether `exclude_current_positions_from_observation` was set to true or false, the x-coordinate - will be returned in `info` with key `"x_position"`. - - However, by default, the observation is a `ndarray` with shape `(11,)` where the elements - correspond to the following: - - | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint| Unit | - |-----|-----------------------|----------------------|--------------------|----------------------|--------------------|--------------------| - | 0 | z-coordinate of the top (height of hopper) | -Inf | Inf | rootz | slide | position (m) | - | 1 | angle of the top | -Inf | Inf | rooty | hinge | angle (rad) | - | 2 | angle of the thigh joint | -Inf | Inf | thigh_joint | hinge | angle (rad) | - | 3 | angle of the leg joint | -Inf | Inf | leg_joint | hinge | angle (rad) | - | 4 | angle of the foot joint | -Inf | Inf | foot_joint | hinge | angle (rad) | - | 5 | velocity of the x-coordinate of the top | -Inf | Inf | rootx | slide | velocity (m/s) | - | 6 | velocity of the z-coordinate (height) of the top | -Inf | Inf | rootz | slide | velocity (m/s) | - | 7 | angular velocity of the angle of the top | -Inf | Inf | rooty | hinge | angular velocity (rad/s) | - | 8 | angular velocity of the thigh hinge | -Inf | Inf | thigh_joint | hinge | angular velocity (rad/s) | - | 9 | angular velocity of the leg hinge | -Inf | Inf | leg_joint | hinge | angular velocity (rad/s) | - | 10 | angular velocity of the foot hinge | -Inf | Inf | foot_joint | hinge | angular velocity (rad/s) | - - ### Rewards - The reward consists of three parts: - - *healthy_reward*: Every timestep that the hopper is healthy (see definition in section "Episode Termination"), it gets a reward of fixed value `healthy_reward`. - - *forward_reward*: A reward of hopping forward which is measured - as *`forward_reward_weight` * (x-coordinate before action - x-coordinate after action)/dt*. *dt* is - the time between actions and is dependent on the frame_skip parameter - (fixed to 4), where the frametime is 0.002 - making the - default *dt = 4 * 0.002 = 0.008*. This reward would be positive if the hopper - hops forward (positive x direction). - - *ctrl_cost*: A cost for penalising the hopper if it takes - actions that are too large. It is measured as *`ctrl_cost_weight` * - sum(action2)* where *`ctrl_cost_weight`* is a parameter set for the - control and has a default value of 0.001 - - The total reward returned is ***reward*** *=* *healthy_reward + forward_reward - ctrl_cost* and `info` will also contain the individual reward terms - - ### Starting State - All observations start in state - (0.0, 1.25, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) with a uniform noise - in the range of [-`reset_noise_scale`, `reset_noise_scale`] added to the values for stochasticity. - - ### Episode Termination - The hopper is said to be unhealthy if any of the following happens: - - 1. An element of `observation[1:]` (if `exclude_current_positions_from_observation=True`, else `observation[2:]`) is no longer contained in the closed interval specified by the argument `healthy_state_range` - 2. The height of the hopper (`observation[0]` if `exclude_current_positions_from_observation=True`, else `observation[1]`) is no longer contained in the closed interval specified by the argument `healthy_z_range` (usually meaning that it has fallen) - 3. The angle (`observation[1]` if `exclude_current_positions_from_observation=True`, else `observation[2]`) is no longer contained in the closed interval specified by the argument `healthy_angle_range` - - If `terminate_when_unhealthy=True` is passed during construction (which is the default), - the episode terminates when any of the following happens: - - 1. The episode duration reaches a 1000 timesteps - 2. The hopper is unhealthy - - If `terminate_when_unhealthy=False` is passed, the episode is terminated only when 1000 timesteps are exceeded. - - ### Arguments - - No additional arguments are currently supported in v2 and lower. - - ``` - env = gym.make('Hopper-v2') - ``` - - v3 and beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. - - ``` - env = gym.make('Hopper-v3', ctrl_cost_weight=0.1, ....) - ``` - - | Parameter | Type | Default |Description | - |-------------------------|------------|----------------|-------------------------------| - | `xml_file` | **str** | `"hopper.xml"` | Path to a MuJoCo model | - | `forward_reward_weight` | **float** | `1.0` | Weight for *forward_reward* term (see section on reward) | - | `ctrl_cost_weight` | **float** | `0.001` | Weight for *ctrl_cost* reward (see section on reward) | - | `healthy_reward` | **float** | `1` | Constant reward given if the ant is "healthy" after timestep | - | `terminate_when_unhealthy` | **bool**| `True` | If true, issue a done signal if the hopper is no longer healthy | - | `healthy_state_range` | **tuple** | `(-100, 100)` | The elements of `observation[1:]` (if `exclude_current_positions_from_observation=True`, else `observation[2:]`) must be in this range for the hopper to be considered healthy | - | `healthy_z_range` | **tuple** | `(0.7, float("inf"))` | The z-coordinate must be in this range for the hopper to be considered healthy | - | `healthy_angle_range` | **tuple** | `(-0.2, 0.2)` | The angle given by `observation[1]` (if `exclude_current_positions_from_observation=True`, else `observation[2]`) must be in this range for the hopper to be considered healthy | - | `reset_noise_scale` | **float** | `5e-3` | Scale of random perturbations of initial position and velocity (see section on Starting State) | - | `exclude_current_positions_from_observation`| **bool** | `True`| Whether or not to omit the x-coordinate from observations. Excluding the position can serve as an inductive bias to induce position-agnostic behavior in policies | - - - ### Version History - - * v3: support for gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. rgb rendering comes from tracking camera (so agent does not run away from screen) - * v2: All continuous control environments now use mujoco_py >= 1.50 - * v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments. - * v0: Initial versions release (1.0.0) - """ - def __init__( self, xml_file="hopper.xml", @@ -170,7 +46,7 @@ def __init__( exclude_current_positions_from_observation ) - mujoco_env.MujocoEnv.__init__(self, xml_file, 4) + mujoco_env.MujocoEnv.__init__(self, xml_file, 4, mujoco_bindings="mujoco_py") @property def healthy_reward(self): diff --git a/gym/envs/mujoco/hopper_v4.py b/gym/envs/mujoco/hopper_v4.py new file mode 100644 index 00000000000..9c80e929320 --- /dev/null +++ b/gym/envs/mujoco/hopper_v4.py @@ -0,0 +1,263 @@ +import numpy as np + +from gym import utils +from gym.envs.mujoco import mujoco_env + +DEFAULT_CAMERA_CONFIG = { + "trackbodyid": 2, + "distance": 3.0, + "lookat": np.array((0.0, 0.0, 1.15)), + "elevation": -20.0, +} + + +class HopperEnv(mujoco_env.MujocoEnv, utils.EzPickle): + """ + ### Description + + This environment is based on the work done by Erez, Tassa, and Todorov in + ["Infinite Horizon Model Predictive Control for Nonlinear Periodic Tasks"](http://www.roboticsproceedings.org/rss07/p10.pdf). The environment aims to + increase the number of independent state and control variables as compared to + the classic control environments. The hopper is a two-dimensional + one-legged figure that consist of four main body parts - the torso at the + top, the thigh in the middle, the leg in the bottom, and a single foot on + which the entire body rests. The goal is to make hops that move in the + forward (right) direction by applying torques on the three hinges + connecting the four body parts. + + ### Action Space + The agent take a 3-element vector for actions. + The action space is a continuous `(action, action, action)` all in `[-1, 1]` + , where `action` represents the numerical torques applied between *links* + + | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit | + |-----|------------------------------------|-------------|-------------|----------------------------------|-------|--------------| + | 0 | Torque applied on the thigh rotor | -1 | 1 | thigh_joint | hinge | torque (N m) | + | 1 | Torque applied on the leg rotor | -1 | 1 | leg_joint | hinge | torque (N m) | + | 3 | Torque applied on the foot rotor | -1 | 1 | foot_joint | hinge | torque (N m) | + + ### Observation Space + + The state space consists of positional values of different body parts of the + hopper, followed by the velocities of those individual parts + (their derivatives) with all the positions ordered before all the velocities. + + The observation is a `ndarray` with shape `(11,)` where the elements + correspond to the following: + + | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint| Unit | + |-----|-----------------------|----------------------|--------------------|----------------------|--------------------|--------------------| + | 0 | x-coordinate of the top | -Inf | Inf | rootx | slide | position (m) | + | 1 | z-coordinate of the top (height of hopper) | -Inf | Inf | rootz | slide | position (m) | + | 2 | angle of the top | -Inf | Inf | rooty | hinge | angle (rad) | + | 3 | angle of the thigh joint | -Inf | Inf | thigh_joint | hinge | angle (rad) | + | 4 | angle of the leg joint | -Inf | Inf | leg_joint | hinge | angle (rad) | + | 5 | angle of the foot joint | -Inf | Inf | foot_joint | hinge | angle (rad) | + | 6 | velocity of the x-coordinate of the top | -Inf | Inf | rootx | slide | velocity (m/s) | + | 7 | velocity of the z-coordinate (height) of the top | -Inf | Inf | rootz | slide | velocity (m/s) | + | 8 | angular velocity of the angle of the top | -Inf | Inf | rooty | hinge | angular velocity (rad/s) | + | 9 | angular velocity of the thigh hinge | -Inf | Inf | thigh_joint | hinge | angular velocity (rad/s) | + | 10 | angular velocity of the leg hinge | -Inf | Inf | leg_joint | hinge | angular velocity (rad/s) | + | 11 | angular velocity of the foot hinge | -Inf | Inf | foot_joint | hinge | angular velocity (rad/s) | + + + + **Note:** + In practice (and Gym implementation), the first positional element is + omitted from the state space since the reward function is calculated based + on that value. This value is hidden from the algorithm, which in turn has + to develop an abstract understanding of it from the observed rewards. + Therefore, observation space has shape `(11,)` instead of `(12,)` and looks like: + + | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint| Unit | + |-----|-----------------------|----------------------|--------------------|----------------------|--------------------|--------------------| + | 0 | z-coordinate of the top (height of hopper) | -Inf | Inf | rootz | slide | position (m) | + | 1 | angle of the top | -Inf | Inf | rooty | hinge | angle (rad) | + | 2 | angle of the thigh joint | -Inf | Inf | thigh_joint | hinge | angle (rad) | + | 3 | angle of the leg joint | -Inf | Inf | leg_joint | hinge | angle (rad) | + | 4 | angle of the foot joint | -Inf | Inf | foot_joint | hinge | angle (rad) | + | 5 | velocity of the x-coordinate of the top | -Inf | Inf | rootx | slide | velocity (m/s) | + | 6 | velocity of the z-coordinate (height) of the top | -Inf | Inf | rootz | slide | velocity (m/s) | + | 7 | angular velocity of the angle of the top | -Inf | Inf | rooty | hinge | angular velocity (rad/s) | + | 8 | angular velocity of the thigh hinge | -Inf | Inf | thigh_joint | hinge | angular velocity (rad/s) | + | 9 | angular velocity of the leg hinge | -Inf | Inf | leg_joint | hinge | angular velocity (rad/s) | + | 10 | angular velocity of the foot hinge | -Inf | Inf | foot_joint | hinge | angular velocity (rad/s) | + + ### Rewards + The reward consists of three parts: + - *alive bonus*: Every timestep that the hopper is alive, it gets a reward of 1, + - *reward_forward*: A reward of hopping forward which is measured + as *(x-coordinate before action - x-coordinate after action)/dt*. *dt* is + the time between actions and is dependeent on the frame_skip parameter + (default is 4), where the *dt* for one frame is 0.002 - making the + default *dt = 4*0.002 = 0.008*. This reward would be positive if the hopper + hops forward (right) desired. + - *reward_control*: A negative reward for penalising the hopper if it takes + actions that are too large. It is measured as *-coefficient **x** + sum(action2)* where *coefficient* is a parameter set for the + control and has a default value of 0.001 + + The total reward returned is ***reward*** *=* *alive bonus + reward_forward + reward_control* + + ### Starting State + All observations start in state + (0.0, 1.25, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) with a uniform noise + in the range of [-0.005, 0.005] added to the values for stochasticity. + + ### Episode Termination + The episode terminates when any of the following happens: + + 1. The episode duration reaches a 1000 timesteps + 2. Any of the state space values is no longer finite + 3. The absolute value of any of the state variable indexed (angle and beyond) is greater than 100 + 4. The height of the hopper becomes greater than 0.7 metres (hopper has hopped too high). + 5. The absolute value of the angle (index 2) is less than 0.2 radians (hopper has fallen down). + + ### Arguments + + No additional arguments are currently supported (in v2 and lower), but + modifications can be made to the XML file in the assets folder + (or by changing the path to a modified XML file in another folder). + + ``` + env = gym.make('Hopper-v2') + ``` + + v3 and beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. + + ``` + env = gym.make('Hopper-v3', ctrl_cost_weight=0.1, ....) + ``` + + ### Version History + + * v4: all mujoco environments now use the mujoco binidings in mujoco>=2.1.3 + * v3: support for gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. rgb rendering comes from tracking camera (so agent does not run away from screen) + * v2: All continuous control environments now use mujoco_py >= 1.50 + * v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments. + * v0: Initial versions release (1.0.0) + """ + + def __init__( + self, + xml_file="hopper.xml", + forward_reward_weight=1.0, + ctrl_cost_weight=1e-3, + healthy_reward=1.0, + terminate_when_unhealthy=True, + healthy_state_range=(-100.0, 100.0), + healthy_z_range=(0.7, float("inf")), + healthy_angle_range=(-0.2, 0.2), + reset_noise_scale=5e-3, + exclude_current_positions_from_observation=True, + ): + utils.EzPickle.__init__(**locals()) + + self._forward_reward_weight = forward_reward_weight + + self._ctrl_cost_weight = ctrl_cost_weight + + self._healthy_reward = healthy_reward + self._terminate_when_unhealthy = terminate_when_unhealthy + + self._healthy_state_range = healthy_state_range + self._healthy_z_range = healthy_z_range + self._healthy_angle_range = healthy_angle_range + + self._reset_noise_scale = reset_noise_scale + + self._exclude_current_positions_from_observation = ( + exclude_current_positions_from_observation + ) + + mujoco_env.MujocoEnv.__init__(self, xml_file, 4) + + @property + def healthy_reward(self): + return ( + float(self.is_healthy or self._terminate_when_unhealthy) + * self._healthy_reward + ) + + def control_cost(self, action): + control_cost = self._ctrl_cost_weight * np.sum(np.square(action)) + return control_cost + + @property + def is_healthy(self): + z, angle = self.data.qpos[1:3] + state = self.state_vector()[2:] + + min_state, max_state = self._healthy_state_range + min_z, max_z = self._healthy_z_range + min_angle, max_angle = self._healthy_angle_range + + healthy_state = np.all(np.logical_and(min_state < state, state < max_state)) + healthy_z = min_z < z < max_z + healthy_angle = min_angle < angle < max_angle + + is_healthy = all((healthy_state, healthy_z, healthy_angle)) + + return is_healthy + + @property + def done(self): + done = not self.is_healthy if self._terminate_when_unhealthy else False + return done + + def _get_obs(self): + position = self.data.qpos.flat.copy() + velocity = np.clip(self.data.qvel.flat.copy(), -10, 10) + + if self._exclude_current_positions_from_observation: + position = position[1:] + + observation = np.concatenate((position, velocity)).ravel() + return observation + + def step(self, action): + x_position_before = self.data.qpos[0] + self.do_simulation(action, self.frame_skip) + x_position_after = self.data.qpos[0] + x_velocity = (x_position_after - x_position_before) / self.dt + + ctrl_cost = self.control_cost(action) + + forward_reward = self._forward_reward_weight * x_velocity + healthy_reward = self.healthy_reward + + rewards = forward_reward + healthy_reward + costs = ctrl_cost + + observation = self._get_obs() + reward = rewards - costs + done = self.done + info = { + "x_position": x_position_after, + "x_velocity": x_velocity, + } + + return observation, reward, done, info + + def reset_model(self): + noise_low = -self._reset_noise_scale + noise_high = self._reset_noise_scale + + qpos = self.init_qpos + self.np_random.uniform( + low=noise_low, high=noise_high, size=self.model.nq + ) + qvel = self.init_qvel + self.np_random.uniform( + low=noise_low, high=noise_high, size=self.model.nv + ) + + self.set_state(qpos, qvel) + + observation = self._get_obs() + return observation + + def viewer_setup(self): + for key, value in DEFAULT_CAMERA_CONFIG.items(): + if isinstance(value, np.ndarray): + getattr(self.viewer.cam, key)[:] = value + else: + setattr(self.viewer.cam, key, value) diff --git a/gym/envs/mujoco/humanoid.py b/gym/envs/mujoco/humanoid.py index d025419937e..d69c08eeb99 100644 --- a/gym/envs/mujoco/humanoid.py +++ b/gym/envs/mujoco/humanoid.py @@ -12,7 +12,9 @@ def mass_center(model, sim): class HumanoidEnv(mujoco_env.MujocoEnv, utils.EzPickle): def __init__(self): - mujoco_env.MujocoEnv.__init__(self, "humanoid.xml", 5) + mujoco_env.MujocoEnv.__init__( + self, "humanoid.xml", 5, mujoco_bindings="mujoco_py" + ) utils.EzPickle.__init__(self) def _get_obs(self): diff --git a/gym/envs/mujoco/humanoid_v3.py b/gym/envs/mujoco/humanoid_v3.py index a17887b0ff6..b1daaeaf711 100644 --- a/gym/envs/mujoco/humanoid_v3.py +++ b/gym/envs/mujoco/humanoid_v3.py @@ -18,198 +18,6 @@ def mass_center(model, sim): class HumanoidEnv(mujoco_env.MujocoEnv, utils.EzPickle): - """ - ### Description - - This environment is based on the environment introduced by Tassa, Erez and Todorov - in ["Synthesis and stabilization of complex behaviors through online trajectory optimization"](https://ieeexplore.ieee.org/document/6386025). - The 3D bipedal robot is designed to simulate a human. It has a torso (abdomen) with a pair of - legs and arms. The legs each consist of two links, and so the arms (representing the knees and - elbows respectively). The goal of the environment is to walk forward as fast as possible without falling over. - - ### Action Space - The action space is a `Box(-1, 1, (17,), float32)`. An action represents the torques applied at the hinge joints. - - | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit | - |-----|----------------------|---------------|----------------|---------------------------------------|-------|------| - | 0 | Torque applied on the hinge in the y-coordinate of the abdomen | -0.4 | 0.4 | hip_1 (front_left_leg) | hinge | torque (N m) | - | 1 | Torque applied on the hinge in the z-coordinate of the abdomen | -0.4 | 0.4 | angle_1 (front_left_leg) | hinge | torque (N m) | - | 2 | Torque applied on the hinge in the x-coordinate of the abdomen | -0.4 | 0.4 | hip_2 (front_right_leg) | hinge | torque (N m) | - | 3 | Torque applied on the rotor between torso/abdomen and the right hip (x-coordinate) | -0.4 | 0.4 | right_hip_x (right_thigh) | hinge | torque (N m) | - | 4 | Torque applied on the rotor between torso/abdomen and the right hip (z-coordinate) | -0.4 | 0.4 | right_hip_z (right_thigh) | hinge | torque (N m) | - | 5 | Torque applied on the rotor between torso/abdomen and the right hip (y-coordinate) | -0.4 | 0.4 | right_hip_y (right_thigh) | hinge | torque (N m) | - | 6 | Torque applied on the rotor between the right hip/thigh and the right shin | -0.4 | 0.4 | right_knee | hinge | torque (N m) | - | 7 | Torque applied on the rotor between torso/abdomen and the left hip (x-coordinate) | -0.4 | 0.4 | left_hip_x (left_thigh) | hinge | torque (N m) | - | 8 | Torque applied on the rotor between torso/abdomen and the left hip (z-coordinate) | -0.4 | 0.4 | left_hip_z (left_thigh) | hinge | torque (N m) | - | 9 | Torque applied on the rotor between torso/abdomen and the left hip (y-coordinate) | -0.4 | 0.4 | left_hip_y (left_thigh) | hinge | torque (N m) | - | 10 | Torque applied on the rotor between the left hip/thigh and the left shin | -0.4 | 0.4 | left_knee | hinge | torque (N m) | - | 11 | Torque applied on the rotor between the torso and right upper arm (coordinate -1) | -0.4 | 0.4 | right_shoulder1 | hinge | torque (N m) | - | 12 | Torque applied on the rotor between the torso and right upper arm (coordinate -2) | -0.4 | 0.4 | right_shoulder2 | hinge | torque (N m) | - | 13 | Torque applied on the rotor between the right upper arm and right lower arm | -0.4 | 0.4 | right_elbow | hinge | torque (N m) | - | 14 | Torque applied on the rotor between the torso and left upper arm (coordinate -1) | -0.4 | 0.4 | left_shoulder1 | hinge | torque (N m) | - | 15 | Torque applied on the rotor between the torso and left upper arm (coordinate -2) | -0.4 | 0.4 | left_shoulder2 | hinge | torque (N m) | - | 16 | Torque applied on the rotor between the left upper arm and left lower arm | -0.4 | 0.4 | left_elbow | hinge | torque (N m) | - - ### Observation Space - - Observations consist of positional values of different body parts of the Humanoid, - followed by the velocities of those individual parts (their derivatives) with all the - positions ordered before all the velocities. - - By default, observations do not include the x- and y-coordinates of the torso. These may - be included by passing `exclude_current_positions_from_observation=False` during construction. - In that case, the observation space will have 378 dimensions where the first two dimensions - represent the x- and y-coordinates of the torso. - Regardless of whether `exclude_current_positions_from_observation` was set to true or false, the x- and y-coordinates - will be returned in `info` with keys `"x_position"` and `"y_position"`, respectively. - - However, by default, the observation is a `ndarray` with shape `(376,)` where the elements correspond to the following: - - | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit | - |-----|---------------------------------------------------------|----------------|-----------------|----------------------------------------|-------|------| - | 0 | z-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) | - | 1 | x-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) | - | 2 | y-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) | - | 3 | z-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) | - | 4 | w-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) | - | 5 | z-angle of the abdomen (in lower_waist) | -Inf | Inf | abdomen_z | hinge | angle (rad) | - | 6 | y-angle of the abdomen (in lower_waist) | -Inf | Inf | abdomen_y | hinge | angle (rad) | - | 7 | x-angle of the abdomen (in pelvis) | -Inf | Inf | abdomen_x | hinge | angle (rad) | - | 8 | x-coordinate of angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_x | hinge | angle (rad) | - | 9 | z-coordinate of angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_z | hinge | angle (rad) | - | 19 | y-coordinate of angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_y | hinge | angle (rad) | - | 11 | angle between right hip and the right shin (in right_knee) | -Inf | Inf | right_knee | hinge | angle (rad) | - | 12 | x-coordinate of angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_x | hinge | angle (rad) | - | 13 | z-coordinate of angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_z | hinge | angle (rad) | - | 14 | y-coordinate of angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_y | hinge | angle (rad) | - | 15 | angle between left hip and the left shin (in left_knee) | -Inf | Inf | left_knee | hinge | angle (rad) | - | 16 | coordinate-1 (multi-axis) angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder1 | hinge | angle (rad) | - | 17 | coordinate-2 (multi-axis) angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder2 | hinge | angle (rad) | - | 18 | angle between right upper arm and right_lower_arm | -Inf | Inf | right_elbow | hinge | angle (rad) | - | 19 | coordinate-1 (multi-axis) angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder1 | hinge | angle (rad) | - | 20 | coordinate-2 (multi-axis) angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder2 | hinge | angle (rad) | - | 21 | angle between left upper arm and left_lower_arm | -Inf | Inf | left_elbow | hinge | angle (rad) | - | 22 | x-coordinate velocity of the torso (centre) | -Inf | Inf | root | free | velocity (m/s) | - | 23 | y-coordinate velocity of the torso (centre) | -Inf | Inf | root | free | velocity (m/s) | - | 24 | z-coordinate velocity of the torso (centre) | -Inf | Inf | root | free | velocity (m/s) | - | 25 | x-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | anglular velocity (rad/s) | - | 26 | y-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | anglular velocity (rad/s) | - | 27 | z-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | anglular velocity (rad/s) | - | 28 | z-coordinate of angular velocity of the abdomen (in lower_waist) | -Inf | Inf | abdomen_z | hinge | anglular velocity (rad/s) | - | 29 | y-coordinate of angular velocity of the abdomen (in lower_waist) | -Inf | Inf | abdomen_y | hinge | anglular velocity (rad/s) | - | 30 | x-coordinate of angular velocity of the abdomen (in pelvis) | -Inf | Inf | abdomen_x | hinge | aanglular velocity (rad/s) | - | 31 | x-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_x | hinge | anglular velocity (rad/s) | - | 32 | z-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_z | hinge | anglular velocity (rad/s) | - | 33 | y-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_y | hinge | anglular velocity (rad/s) | - | 34 | angular velocity of the angle between right hip and the right shin (in right_knee) | -Inf | Inf | right_knee | hinge | anglular velocity (rad/s) | - | 35 | x-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_x | hinge | anglular velocity (rad/s) | - | 36 | z-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_z | hinge | anglular velocity (rad/s) | - | 37 | y-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_y | hinge | anglular velocity (rad/s) | - | 38 | angular velocity of the angle between left hip and the left shin (in left_knee) | -Inf | Inf | left_knee | hinge | anglular velocity (rad/s) | - | 39 | coordinate-1 (multi-axis) of the angular velocity of the angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder1 | hinge | anglular velocity (rad/s) | - | 40 | coordinate-2 (multi-axis) of the angular velocity of the angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder2 | hinge | anglular velocity (rad/s) | - | 41 | angular velocity of the angle between right upper arm and right_lower_arm | -Inf | Inf | right_elbow | hinge | anglular velocity (rad/s) | - | 42 | coordinate-1 (multi-axis) of the angular velocity of the angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder1 | hinge | anglular velocity (rad/s) | - | 43 | coordinate-2 (multi-axis) of the angular velocity of the angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder2 | hinge | anglular velocity (rad/s) | - | 44 | angular velocitty of the angle between left upper arm and left_lower_arm | -Inf | Inf | left_elbow | hinge | anglular velocity (rad/s) | - - Additionally, after all the positional and velocity based values in the table, - the observation contains (in order): - - *cinert:* Mass and inertia of a single rigid body relative to the center of mass - (this is an intermediate result of transition). It has shape 14*10 (*nbody * 10*) - and hence adds to another 140 elements in the state space. - - *cvel:* Center of mass based velocity. It has shape 14 * 6 (*nbody * 6*) and hence - adds another 84 elements in the state space - - *qfrc_actuator:* Constraint force generated as the actuator force. This has shape - `(23,)` *(nv * 1)* and hence adds another 23 elements to the state space. - - *cfrc_ext:* This is the center of mass based external force on the body. It has shape - 14 * 6 (*nbody * 6*) and hence adds to another 84 elements in the state space. - where *nbody* stands for the number of bodies in the robot and *nv* stands for the - number of degrees of freedom (*= dim(qvel)*) - - The (x,y,z) coordinates are translational DOFs while the orientations are rotational - DOFs expressed as quaternions. One can read more about free joints on the - [Mujoco Documentation](https://mujoco.readthedocs.io/en/latest/XMLreference.html). - - **Note:** There have been reported issues that using a Mujoco-Py version > 2.0 - results in the contact forces always being 0. As such we recommend to use a Mujoco-Py - version < 2.0 when using the Humanoid environment if you would like to report results - with contact forces (if contact forces are not used in your experiments, you can use - version > 2.0). - - ### Rewards - The reward consists of three parts: - - *healthy_reward*: Every timestep that the humanoid is alive (see section Episode Termination for definition), it gets a reward of fixed value `healthy_reward` - - *forward_reward*: A reward of walking forward which is measured as *`forward_reward_weight` * - (average center of mass before action - average center of mass after action)/dt*. - *dt* is the time between actions and is dependent on the frame_skip parameter - (default is 5), where the frametime is 0.003 - making the default *dt = 5 * 0.003 = 0.015*. - This reward would be positive if the humanoid walks forward (in positive x-direction). The calculation - for the center of mass is defined in the `.py` file for the Humanoid. - - *ctrl_cost*: A negative reward for penalising the humanoid if it has too - large of a control force. If there are *nu* actuators/controls, then the control has - shape `nu x 1`. It is measured as *`ctrl_cost_weight` * sum(control2)*. - - *contact_cost*: A negative reward for penalising the humanoid if the external - contact force is too large. It is calculated by clipping - *`contact_cost_weight` * sum(external contact force2)* to the interval specified by `contact_cost_range`. - - The total reward returned is ***reward*** *=* *healthy_reward + forward_reward - ctrl_cost - contact_cost* and `info` will also contain the individual reward terms - - ### Starting State - All observations start in state - (0.0, 0.0, 1.4, 1.0, 0.0 ... 0.0) with a uniform noise in the range - of [-`reset_noise_scale`, `reset_noise_scale`] added to the positional and velocity values (values in the table) - for stochasticity. Note that the initial z coordinate is intentionally - selected to be high, thereby indicating a standing up humanoid. The initial - orientation is designed to make it face forward as well. - - ### Episode Termination - The humanoid is said to be unhealthy if the z-position of the torso is no longer contained in the - closed interval specified by the argument `healthy_z_range`. - - If `terminate_when_unhealthy=True` is passed during construction (which is the default), - the episode terminates when any of the following happens: - - 1. The episode duration reaches a 1000 timesteps - 3. The humanoid is unhealthy - - If `terminate_when_unhealthy=False` is passed, the episode is terminated only when 1000 timesteps are exceeded. - - - ### Arguments - - No additional arguments are currently supported in v2 and lower. - - ``` - env = gym.make('Humanoid-v2') - ``` - - v3 and beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. - - ``` - env = gym.make('Humanoid-v3', ctrl_cost_weight=0.1, ....) - ``` - - | Parameter | Type | Default |Description | - |-------------------------|------------|-------------------|-------------------------------| - | `xml_file` | **str** | `"humanoid.xml"` | Path to a MuJoCo model | - | `forward_reward_weight` | **float** | `1.25` | Weight for *forward_reward* term (see section on reward) | - | `ctrl_cost_weight` | **float** | `0.1` | Weight for *ctrl_cost* term (see section on reward) | - | `contact_cost_weight` | **float** | `5e-7` | Weight for *contact_cost* term (see section on reward) | - | `healthy_reward` | **float** | `5.0` | Constant reward given if the humanoid is "healthy" after timestep | - | `terminate_when_unhealthy` | **bool**| `True` | If true, issue a done signal if the z-coordinate of the torso is no longer in the `healthy_z_range` | - | `healthy_z_range` | **tuple** | `(1.0, 2.0)` | The humanoid is considered healthy if the z-coordinate of the torso is in this range | - | `reset_noise_scale` | **float** | `1e-2` | Scale of random perturbations of initial position and velocity (see section on Starting State) | - | `exclude_current_positions_from_observation`| **bool** | `True`| Whether or not to omit the x- and y-coordinates from observations. Excluding the position can serve as an inductive bias to induce position-agnostic behavior in policies | - - ### Version History - - * v3: support for gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. rgb rendering comes from tracking camera (so agent does not run away from screen) - * v2: All continuous control environments now use mujoco_py >= 1.50 - * v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments. - * v0: Initial versions release (1.0.0) - """ - def __init__( self, xml_file="humanoid.xml", @@ -239,7 +47,7 @@ def __init__( exclude_current_positions_from_observation ) - mujoco_env.MujocoEnv.__init__(self, xml_file, 5) + mujoco_env.MujocoEnv.__init__(self, xml_file, 5, mujoco_bindings="mujoco_py") @property def healthy_reward(self): diff --git a/gym/envs/mujoco/humanoid_v4.py b/gym/envs/mujoco/humanoid_v4.py new file mode 100644 index 00000000000..d2a0bf4ecd9 --- /dev/null +++ b/gym/envs/mujoco/humanoid_v4.py @@ -0,0 +1,331 @@ +import numpy as np + +from gym import utils +from gym.envs.mujoco import mujoco_env + +DEFAULT_CAMERA_CONFIG = { + "trackbodyid": 1, + "distance": 4.0, + "lookat": np.array((0.0, 0.0, 2.0)), + "elevation": -20.0, +} + + +def mass_center(model, data): + mass = np.expand_dims(model.body_mass, axis=1) + xpos = data.xipos + return (np.sum(mass * xpos, axis=0) / np.sum(mass))[0:2].copy() + + +class HumanoidEnv(mujoco_env.MujocoEnv, utils.EzPickle): + """ + ### Description + + This environment is based on the environment introduced by Tassa, Erez and Todorov + in ["Synthesis and stabilization of complex behaviors through online trajectory optimization"](https://ieeexplore.ieee.org/document/6386025). + The 3D bipedal robot is designed to simulate a human. It has a torso (abdomen) with a pair of + legs and arms. The legs each consist of two links, and so the arms (representing the knees and + elbows respectively). The goal of the environment is to walk forward as fast as possible without falling over. + + ### Action Space + The agent take a 17-element vector for actions. + The action space is a continuous `(action, ...)` all in `[-1, 1]`, where `action` + represents the numerical torques applied at the hinge joints. + + | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit | + |-----|----------------------|---------------|----------------|---------------------------------------|-------|------| + | 0 | Torque applied on the hinge in the y-coordinate of the abdomen | -0.4 | 0.4 | hip_1 (front_left_leg) | hinge | torque (N m) | + | 1 | Torque applied on the hinge in the z-coordinate of the abdomen | -0.4 | 0.4 | angle_1 (front_left_leg) | hinge | torque (N m) | + | 2 | Torque applied on the hinge in the x-coordinate of the abdomen | -0.4 | 0.4 | hip_2 (front_right_leg) | hinge | torque (N m) | + | 3 | Torque applied on the rotor between torso/abdomen and the right hip (x-coordinate) | -0.4 | 0.4 | right_hip_x (right_thigh) | hinge | torque (N m) | + | 4 | Torque applied on the rotor between torso/abdomen and the right hip (z-coordinate) | -0.4 | 0.4 | right_hip_z (right_thigh) | hinge | torque (N m) | + | 5 | Torque applied on the rotor between torso/abdomen and the right hip (y-coordinate) | -0.4 | 0.4 | right_hip_y (right_thigh) | hinge | torque (N m) | + | 6 | Torque applied on the rotor between the right hip/thigh and the right shin | -0.4 | 0.4 | right_knee | hinge | torque (N m) | + | 7 | Torque applied on the rotor between torso/abdomen and the left hip (x-coordinate) | -0.4 | 0.4 | left_hip_x (left_thigh) | hinge | torque (N m) | + | 8 | Torque applied on the rotor between torso/abdomen and the left hip (z-coordinate) | -0.4 | 0.4 | left_hip_z (left_thigh) | hinge | torque (N m) | + | 9 | Torque applied on the rotor between torso/abdomen and the left hip (y-coordinate) | -0.4 | 0.4 | left_hip_y (left_thigh) | hinge | torque (N m) | + | 10 | Torque applied on the rotor between the left hip/thigh and the left shin | -0.4 | 0.4 | left_knee | hinge | torque (N m) | + | 11 | Torque applied on the rotor between the torso and right upper arm (coordinate -1) | -0.4 | 0.4 | right_shoulder1 | hinge | torque (N m) | + | 12 | Torque applied on the rotor between the torso and right upper arm (coordinate -2) | -0.4 | 0.4 | right_shoulder2 | hinge | torque (N m) | + | 13 | Torque applied on the rotor between the right upper arm and right lower arm | -0.4 | 0.4 | right_elbow | hinge | torque (N m) | + | 14 | Torque applied on the rotor between the torso and left upper arm (coordinate -1) | -0.4 | 0.4 | left_shoulder1 | hinge | torque (N m) | + | 15 | Torque applied on the rotor between the torso and left upper arm (coordinate -2) | -0.4 | 0.4 | left_shoulder2 | hinge | torque (N m) | + | 16 | Torque applied on the rotor between the left upper arm and left lower arm | -0.4 | 0.4 | left_elbow | hinge | torque (N m) | + + ### Observation Space + + The state space consists of positional values of different body parts of the Humanoid, + followed by the velocities of those individual parts (their derivatives) with all the + positions ordered before all the velocities. + + The observation is a `ndarray` with shape `(376,)` where the elements correspond to the following: + + | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit | + |-----|---------------------------------------------------------|----------------|-----------------|----------------------------------------|-------|------| + | 0 | x-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) | + | 1 | y-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) | + | 2 | z-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) | + | 3 | x-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) | + | 4 | y-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) | + | 5 | z-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) | + | 6 | w-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) | + | 7 | z-angle of the abdomen (in lower_waist) | -Inf | Inf | abdomen_z | hinge | angle (rad) | + | 8 | y-angle of the abdomen (in lower_waist) | -Inf | Inf | abdomen_y | hinge | angle (rad) | + | 9 | x-angle of the abdomen (in pelvis) | -Inf | Inf | abdomen_x | hinge | angle (rad) | + | 10 | x-coordinate of angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_x | hinge | angle (rad) | + | 11 | z-coordinate of angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_z | hinge | angle (rad) | + | 12 | y-coordinate of angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_y | hinge | angle (rad) | + | 13 | angle between right hip and the right shin (in right_knee) | -Inf | Inf | right_knee | hinge | angle (rad) | + | 14 | x-coordinate of angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_x | hinge | angle (rad) | + | 15 | z-coordinate of angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_z | hinge | angle (rad) | + | 16 | y-coordinate of angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_y | hinge | angle (rad) | + | 17 | angle between left hip and the left shin (in left_knee) | -Inf | Inf | left_knee | hinge | angle (rad) | + | 18 | coordinate-1 (multi-axis) angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder1 | hinge | angle (rad) | + | 19 | coordinate-2 (multi-axis) angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder2 | hinge | angle (rad) | + | 20 | angle between right upper arm and right_lower_arm | -Inf | Inf | right_elbow | hinge | angle (rad) | + | 21 | coordinate-1 (multi-axis) angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder1 | hinge | angle (rad) | + | 22 | coordinate-2 (multi-axis) angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder2 | hinge | angle (rad) | + | 23 | angle between left upper arm and left_lower_arm | -Inf | Inf | left_elbow | hinge | angle (rad) | + | 24 | x-coordinate velocity of the torso (centre) | -Inf | Inf | root | free | velocity (m/s) | + | 25 | y-coordinate velocity of the torso (centre) | -Inf | Inf | root | free | velocity (m/s) | + | 26 | z-coordinate velocity of the torso (centre) | -Inf | Inf | root | free | velocity (m/s) | + | 27 | x-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | anglular velocity (rad/s) | + | 28 | y-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | anglular velocity (rad/s) | + | 29 | z-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | anglular velocity (rad/s) | + | 30 | z-coordinate of angular velocity of the abdomen (in lower_waist) | -Inf | Inf | abdomen_z | hinge | anglular velocity (rad/s) | + | 31 | y-coordinate of angular velocity of the abdomen (in lower_waist) | -Inf | Inf | abdomen_y | hinge | anglular velocity (rad/s) | + | 32 | x-coordinate of angular velocity of the abdomen (in pelvis) | -Inf | Inf | abdomen_x | hinge | aanglular velocity (rad/s) | + | 33 | x-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_x | hinge | anglular velocity (rad/s) | + | 34 | z-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_z | hinge | anglular velocity (rad/s) | + | 35 | y-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_y | hinge | anglular velocity (rad/s) | + | 36 | angular velocity of the angle between right hip and the right shin (in right_knee) | -Inf | Inf | right_knee | hinge | anglular velocity (rad/s) | + | 37 | x-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_x | hinge | anglular velocity (rad/s) | + | 38 | z-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_z | hinge | anglular velocity (rad/s) | + | 39 | y-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_y | hinge | anglular velocity (rad/s) | + | 40 | angular velocity of the angle between left hip and the left shin (in left_knee) | -Inf | Inf | left_knee | hinge | anglular velocity (rad/s) | + | 41 | coordinate-1 (multi-axis) of the angular velocity of the angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder1 | hinge | anglular velocity (rad/s) | + | 42 | coordinate-2 (multi-axis) of the angular velocity of the angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder2 | hinge | anglular velocity (rad/s) | + | 43 | angular velocity of the angle between right upper arm and right_lower_arm | -Inf | Inf | right_elbow | hinge | anglular velocity (rad/s) | + | 44 | coordinate-1 (multi-axis) of the angular velocity of the angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder1 | hinge | anglular velocity (rad/s) | + | 45 | coordinate-2 (multi-axis) of the angular velocity of the angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder2 | hinge | anglular velocity (rad/s) | + | 46 | angular velocitty of the angle between left upper arm and left_lower_arm | -Inf | Inf | left_elbow | hinge | anglular velocity (rad/s) | + + Additionally, after all the positional and velocity based values in the table, + the state_space consists of (in order): + - *cinert:* Mass and inertia of a single rigid body relative to the center of mass + (this is an intermediate result of transition). It has shape 14*10 (*nbody * 10*) + and hence adds to another 140 elements in the state space. + - *cvel:* Center of mass based velocity. It has shape 14 * 6 (*nbody * 6*) and hence + adds another 84 elements in the state space + - *qfrc_actuator:* Constraint force generated as the actuator force. This has shape + `(23,)` *(nv * 1)* and hence adds another 23 elements to the state space. + - *cfrc_ext:* This is the center of mass based external force on the body. It has shape + 14 * 6 (*nbody * 6*) and hence adds to another 84 elements in the state space. + where *nbody* stands for the number of bodies in the robot and *nv* stands for the + number of degrees of freedom (*= dim(qvel)*) + + The (x,y,z) coordinates are translational DOFs while the orientations are rotational + DOFs expressed as quaternions. One can read more about free joints on the + [Mujoco Documentation](https://mujoco.readthedocs.io/en/latest/XMLreference.html). + + **Note:** There are 47 elements in the table above - giving rise to `(378,)` + elements in the state space. In practice (and Gym implementation), the first two + positional elements are omitted from the state space since the reward function is + calculated based on the x-coordinate value. This value is hidden from the algorithm, + which in turn has to develop an abstract understanding of it from the observed rewards. + Therefore, observation space has shape `(376,)` instead of `(378,)` and the table should + not have the first two rows. + + **Note:** Humanoid-v4 environment no longer has the following contact forces issue. + If using previous Humanoid versions from v4, there have been reported issues that using + a Mujoco-Py version > 2.0 results in the contact forces always being 0. As such we recommend + to use a Mujoco-Py version < 2.0 when using the Humanoid environment if you would like to report + results with contact forces (if contact forces are not used in your experiments, you can use + version > 2.0). + + ### Rewards + The reward consists of three parts: + - *alive_bonus*: Every timestep that the humanoid is alive, it gets a reward of 5. + - *lin_vel_cost*: A reward of walking forward which is measured as *1.25 * + (average center of mass before action - average center of mass after action)/dt*. + *dt* is the time between actions and is dependent on the frame_skip parameter + (default is 5), where the *dt* for one frame is 0.003 - making the default *dt = 5 * 0.003 = 0.015*. + This reward would be positive if the humanoid walks forward (right) desired. The calculation + for the center of mass is defined in the `.py` file for the Humanoid. + - *quad_ctrl_cost*: A negative reward for penalising the humanoid if it has too + large of a control force. If there are *nu* actuators/controls, then the control has + shape `nu x 1`. It is measured as *0.1 **x** sum(control2)*. + - *quad_impact_cost*: A negative reward for penalising the humanoid if the external + contact force is too large. It is calculated as + *min(0.5 * 0.000001 * sum(external contact force2), 10)*. + + The total reward returned is ***reward*** *=* *alive_bonus + lin_vel_cost - quad_ctrl_cost - quad_impact_cost* + + ### Starting State + All observations start in state + (0.0, 0.0, 1.4, 1.0, 0.0 ... 0.0) with a uniform noise in the range + of [-0.01, 0.01] added to the positional and velocity values (values in the table) + for stochasticity. Note that the initial z coordinate is intentionally + selected to be high, thereby indicating a standing up humanoid. The initial + orientation is designed to make it face forward as well. + + ### Episode Termination + The episode terminates when any of the following happens: + + 1. The episode duration reaches a 1000 timesteps + 2. Any of the state space values is no longer finite + 3. The z-coordinate of the torso (index 0 in state space OR index 2 in the table) is **not** in the range `[1.0, 2.0]` (the humanoid has fallen or is about to fall beyond recovery). + + ### Arguments + + No additional arguments are currently supported (in v2 and lower), but + modifications can be made to the XML file in the assets folder (or by changing + the path to a modified XML file in another folder).. + + ``` + env = gym.make('Ant-v2') + ``` + + v3 and beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. + + ``` + env = gym.make('Ant-v3', ctrl_cost_weight=0.1, ....) + ``` + + ### Version History + + * v4: all mujoco environments now use the mujoco binidings in mujoco>=2.1.3 + * v3: support for gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. rgb rendering comes from tracking camera (so agent does not run away from screen) + * v2: All continuous control environments now use mujoco_py >= 1.50 + * v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments. + * v0: Initial versions release (1.0.0) + + """ + + def __init__( + self, + xml_file="humanoid.xml", + forward_reward_weight=1.25, + ctrl_cost_weight=0.1, + healthy_reward=5.0, + terminate_when_unhealthy=True, + healthy_z_range=(1.0, 2.0), + reset_noise_scale=1e-2, + exclude_current_positions_from_observation=True, + ): + utils.EzPickle.__init__(**locals()) + + self._forward_reward_weight = forward_reward_weight + self._ctrl_cost_weight = ctrl_cost_weight + self._healthy_reward = healthy_reward + self._terminate_when_unhealthy = terminate_when_unhealthy + self._healthy_z_range = healthy_z_range + + self._reset_noise_scale = reset_noise_scale + + self._exclude_current_positions_from_observation = ( + exclude_current_positions_from_observation + ) + + mujoco_env.MujocoEnv.__init__(self, xml_file, 5) + + @property + def healthy_reward(self): + return ( + float(self.is_healthy or self._terminate_when_unhealthy) + * self._healthy_reward + ) + + def control_cost(self, action): + control_cost = self._ctrl_cost_weight * np.sum(np.square(self.data.ctrl)) + return control_cost + + @property + def is_healthy(self): + min_z, max_z = self._healthy_z_range + is_healthy = min_z < self.data.qpos[2] < max_z + + return is_healthy + + @property + def done(self): + done = (not self.is_healthy) if self._terminate_when_unhealthy else False + return done + + def _get_obs(self): + position = self.data.qpos.flat.copy() + velocity = self.data.qvel.flat.copy() + + com_inertia = self.data.cinert.flat.copy() + com_velocity = self.data.cvel.flat.copy() + + actuator_forces = self.data.qfrc_actuator.flat.copy() + external_contact_forces = self.data.cfrc_ext.flat.copy() + + if self._exclude_current_positions_from_observation: + position = position[2:] + + return np.concatenate( + ( + position, + velocity, + com_inertia, + com_velocity, + actuator_forces, + external_contact_forces, + ) + ) + + def step(self, action): + xy_position_before = mass_center(self.model, self.data) + self.do_simulation(action, self.frame_skip) + xy_position_after = mass_center(self.model, self.data) + + xy_velocity = (xy_position_after - xy_position_before) / self.dt + x_velocity, y_velocity = xy_velocity + + ctrl_cost = self.control_cost(action) + + forward_reward = self._forward_reward_weight * x_velocity + healthy_reward = self.healthy_reward + + rewards = forward_reward + healthy_reward + + observation = self._get_obs() + reward = rewards - ctrl_cost + done = self.done + info = { + "reward_linvel": forward_reward, + "reward_quadctrl": -ctrl_cost, + "reward_alive": healthy_reward, + "x_position": xy_position_after[0], + "y_position": xy_position_after[1], + "distance_from_origin": np.linalg.norm(xy_position_after, ord=2), + "x_velocity": x_velocity, + "y_velocity": y_velocity, + "forward_reward": forward_reward, + } + + return observation, reward, done, info + + def reset_model(self): + noise_low = -self._reset_noise_scale + noise_high = self._reset_noise_scale + + qpos = self.init_qpos + self.np_random.uniform( + low=noise_low, high=noise_high, size=self.model.nq + ) + qvel = self.init_qvel + self.np_random.uniform( + low=noise_low, high=noise_high, size=self.model.nv + ) + self.set_state(qpos, qvel) + + observation = self._get_obs() + return observation + + def viewer_setup(self): + for key, value in DEFAULT_CAMERA_CONFIG.items(): + if isinstance(value, np.ndarray): + getattr(self.viewer.cam, key)[:] = value + else: + setattr(self.viewer.cam, key, value) diff --git a/gym/envs/mujoco/humanoidstandup.py b/gym/envs/mujoco/humanoidstandup.py index d1a6b47427d..217917b9b09 100644 --- a/gym/envs/mujoco/humanoidstandup.py +++ b/gym/envs/mujoco/humanoidstandup.py @@ -5,178 +5,10 @@ class HumanoidStandupEnv(mujoco_env.MujocoEnv, utils.EzPickle): - """ - ### Description - - This environment is based on the environment introduced by Tassa, Erez and Todorov - in ["Synthesis and stabilization of complex behaviors through online trajectory optimization"](https://ieeexplore.ieee.org/document/6386025). - The 3D bipedal robot is designed to simulate a human. It has a torso (abdomen) with a - pair of legs and arms. The legs each consist of two links, and so the arms (representing the - knees and elbows respectively). The environment starts with the humanoid layiing on the ground, - and then the goal of the environment is to make the humanoid standup and then keep it standing - by applying torques on the various hinges. - - ### Action Space - The agent take a 17-element vector for actions. - - The action space is a continuous `(action, ...)` all in `[-1, 1]`, where `action` - represents the numerical torques applied at the hinge joints. - - | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit | - |-----|----------------------|---------------|----------------|---------------------------------------|-------|------| - | 0 | Torque applied on the hinge in the y-coordinate of the abdomen | -0.4 | 0.4 | hip_1 (front_left_leg) | hinge | torque (N m) | - | 1 | Torque applied on the hinge in the z-coordinate of the abdomen | -0.4 | 0.4 | angle_1 (front_left_leg) | hinge | torque (N m) | - | 2 | Torque applied on the hinge in the x-coordinate of the abdomen | -0.4 | 0.4 | hip_2 (front_right_leg) | hinge | torque (N m) | - | 3 | Torque applied on the rotor between torso/abdomen and the right hip (x-coordinate) | -0.4 | 0.4 | right_hip_x (right_thigh) | hinge | torque (N m) | - | 4 | Torque applied on the rotor between torso/abdomen and the right hip (z-coordinate) | -0.4 | 0.4 | right_hip_z (right_thigh) | hinge | torque (N m) | - | 5 | Torque applied on the rotor between torso/abdomen and the right hip (y-coordinate) | -0.4 | 0.4 | right_hip_y (right_thigh) | hinge | torque (N m) | - | 6 | Torque applied on the rotor between the right hip/thigh and the right shin | -0.4 | 0.4 | right_knee | hinge | torque (N m) | - | 7 | Torque applied on the rotor between torso/abdomen and the left hip (x-coordinate) | -0.4 | 0.4 | left_hip_x (left_thigh) | hinge | torque (N m) | - | 8 | Torque applied on the rotor between torso/abdomen and the left hip (z-coordinate) | -0.4 | 0.4 | left_hip_z (left_thigh) | hinge | torque (N m) | - | 9 | Torque applied on the rotor between torso/abdomen and the left hip (y-coordinate) | -0.4 | 0.4 | left_hip_y (left_thigh) | hinge | torque (N m) | - | 10 | Torque applied on the rotor between the left hip/thigh and the left shin | -0.4 | 0.4 | left_knee | hinge | torque (N m) | - | 11 | Torque applied on the rotor between the torso and right upper arm (coordinate -1) | -0.4 | 0.4 | right_shoulder1 | hinge | torque (N m) | - | 12 | Torque applied on the rotor between the torso and right upper arm (coordinate -2) | -0.4 | 0.4 | right_shoulder2 | hinge | torque (N m) | - | 13 | Torque applied on the rotor between the right upper arm and right lower arm | -0.4 | 0.4 | right_elbow | hinge | torque (N m) | - | 14 | Torque applied on the rotor between the torso and left upper arm (coordinate -1) | -0.4 | 0.4 | left_shoulder1 | hinge | torque (N m) | - | 15 | Torque applied on the rotor between the torso and left upper arm (coordinate -2) | -0.4 | 0.4 | left_shoulder2 | hinge | torque (N m) | - | 16 | Torque applied on the rotor between the left upper arm and left lower arm | -0.4 | 0.4 | left_elbow | hinge | torque (N m) | - - ### Observation Space - - The state space consists of positional values of different body parts of the Humanoid, - followed by the velocities of those individual parts (their derivatives) with all the positions ordered before all the velocities. - - **Note:** The x- and y-coordinates of the torso are being omitted to produce position-agnostic behavior in policies - - The observation is a `ndarray` with shape `(376,)` where the elements correspond to the following: - - | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit | - |-----|---------------------------------------------------------|----------------|-----------------|----------------------------------------|-------|------| - | 0 | z-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) | - | 1 | x-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) | - | 2 | y-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) | - | 3 | z-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) | - | 4 | w-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) | - | 5 | z-angle of the abdomen (in lower_waist) | -Inf | Inf | abdomen_z | hinge | angle (rad) | - | 6 | y-angle of the abdomen (in lower_waist) | -Inf | Inf | abdomen_y | hinge | angle (rad) | - | 7 | x-angle of the abdomen (in pelvis) | -Inf | Inf | abdomen_x | hinge | angle (rad) | - | 8 | x-coordinate of angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_x | hinge | angle (rad) | - | 9 | z-coordinate of angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_z | hinge | angle (rad) | - | 10 | y-coordinate of angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_y | hinge | angle (rad) | - | 11 | angle between right hip and the right shin (in right_knee) | -Inf | Inf | right_knee | hinge | angle (rad) | - | 12 | x-coordinate of angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_x | hinge | angle (rad) | - | 13 | z-coordinate of angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_z | hinge | angle (rad) | - | 14 | y-coordinate of angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_y | hinge | angle (rad) | - | 15 | angle between left hip and the left shin (in left_knee) | -Inf | Inf | left_knee | hinge | angle (rad) | - | 16 | coordinate-1 (multi-axis) angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder1 | hinge | angle (rad) | - | 17 | coordinate-2 (multi-axis) angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder2 | hinge | angle (rad) | - | 18 | angle between right upper arm and right_lower_arm | -Inf | Inf | right_elbow | hinge | angle (rad) | - | 19 | coordinate-1 (multi-axis) angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder1 | hinge | angle (rad) | - | 20 | coordinate-2 (multi-axis) angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder2 | hinge | angle (rad) | - | 21 | angle between left upper arm and left_lower_arm | -Inf | Inf | left_elbow | hinge | angle (rad) | - | 22 | x-coordinate velocity of the torso (centre) | -Inf | Inf | root | free | velocity (m/s) | - | 23 | y-coordinate velocity of the torso (centre) | -Inf | Inf | root | free | velocity (m/s) | - | 24 | z-coordinate velocity of the torso (centre) | -Inf | Inf | root | free | velocity (m/s) | - | 25 | x-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | anglular velocity (rad/s) | - | 26 | y-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | anglular velocity (rad/s) | - | 27 | z-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | anglular velocity (rad/s) | - | 28 | z-coordinate of angular velocity of the abdomen (in lower_waist) | -Inf | Inf | abdomen_z | hinge | anglular velocity (rad/s) | - | 29 | y-coordinate of angular velocity of the abdomen (in lower_waist) | -Inf | Inf | abdomen_y | hinge | anglular velocity (rad/s) | - | 30 | x-coordinate of angular velocity of the abdomen (in pelvis) | -Inf | Inf | abdomen_x | hinge | aanglular velocity (rad/s) | - | 31 | x-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_x | hinge | anglular velocity (rad/s) | - | 32 | z-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_z | hinge | anglular velocity (rad/s) | - | 33 | y-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_y | hinge | anglular velocity (rad/s) | - | 35 | angular velocity of the angle between right hip and the right shin (in right_knee) | -Inf | Inf | right_knee | hinge | anglular velocity (rad/s) | - | 36 | x-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_x | hinge | anglular velocity (rad/s) | - | 37 | z-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_z | hinge | anglular velocity (rad/s) | - | 38 | y-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_y | hinge | anglular velocity (rad/s) | - | 39 | angular velocity of the angle between left hip and the left shin (in left_knee) | -Inf | Inf | left_knee | hinge | anglular velocity (rad/s) | - | 40 | coordinate-1 (multi-axis) of the angular velocity of the angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder1 | hinge | anglular velocity (rad/s) | - | 41 | coordinate-2 (multi-axis) of the angular velocity of the angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder2 | hinge | anglular velocity (rad/s) | - | 42 | angular velocity of the angle between right upper arm and right_lower_arm | -Inf | Inf | right_elbow | hinge | anglular velocity (rad/s) | - | 43 | coordinate-1 (multi-axis) of the angular velocity of the angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder1 | hinge | anglular velocity (rad/s) | - | 44 | coordinate-2 (multi-axis) of the angular velocity of the angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder2 | hinge | anglular velocity (rad/s) | - | 45 | angular velocitty of the angle between left upper arm and left_lower_arm | -Inf | Inf | left_elbow | hinge | anglular velocity (rad/s) | - - - Additionally, after all the positional and velocity based values in the table, - the state_space consists of (in order): - - *cinert:* Mass and inertia of a single rigid body relative to the center of mass - (this is an intermediate result of transition). It has shape 14*10 (*nbody * 10*) - and hence adds to another 140 elements in the state space. - - *cvel:* Center of mass based velocity. It has shape 14 * 6 (*nbody * 6*) and hence - adds another 84 elements in the state space - - *qfrc_actuator:* Constraint force generated as the actuator force. This has shape - `(23,)` *(nv * 1)* and hence adds another 23 elements to the state space. - - *cfrc_ext:* This is the center of mass based external force on the body. It has shape - 14 * 6 (*nbody * 6*) and hence adds to another 84 elements in the state space. - where *nbody* stands for the number of bodies in the robot and *nv* stands for the number - of degrees of freedom (*= dim(qvel)*) - - The (x,y,z) coordinates are translational DOFs while the orientations are rotational - DOFs expressed as quaternions. One can read more about free joints on the - [Mujoco Documentation](https://mujoco.readthedocs.io/en/latest/XMLreference.html). - - **Note:** There have been reported issues that using a Mujoco-Py version > 2.0 results - in the contact forces always being 0. As such we recommend to use a Mujoco-Py version < 2.0 - when using the Humanoid environment if you would like to report results with contact forces - (if contact forces are not used in your experiments, you can use version > 2.0). - - ### Rewards - The reward consists of three parts: - - *uph_cost*: A reward for moving upward (in an attempt to stand up). This is not a relative - reward which meaures how much upward it has moved from the last timestep, but it is an - absolute reward which measures how much upward the Humanoid has moved overall. It is - measured as *(z coordinate after action - 0)/(atomic timestep)*, where *z coordinate after - action* is index 0 in the state/index 2 in the table, and *atomic timestep* is the time for - one frame of movement even though the simulation has a framerate of 5 (done in order to inflate - rewards a little for fassteer learning). - - *quad_ctrl_cost*: A negative reward for penalising the humanoid if it has too large of - a control force. If there are *nu* actuators/controls, then the control has shape `nu x 1`. - It is measured as *0.1 **x** sum(control2)*. - - *quad_impact_cost*: A negative reward for penalising the humanoid if the external - contact force is too large. It is calculated as *min(0.5 * 0.000001 * sum(external - contact force2), 10)*. - - The total reward returned is ***reward*** *=* *uph_cost + 1 - quad_ctrl_cost - quad_impact_cost* - - ### Starting State - All observations start in state - (0.0, 0.0, 0.105, 1.0, 0.0 ... 0.0) with a uniform noise in the range of - [-0.01, 0.01] added to the positional and velocity values (values in the table) - for stochasticity. Note that the initial z coordinate is intentionally selected - to be low, thereby indicating a laying down humanoid. The initial orientation is - designed to make it face forward as well. - - ### Episode Termination - The episode terminates when any of the following happens: - - 1. The episode duration reaches a 1000 timesteps - 2. Any of the state space values is no longer finite - - ### Arguments - - No additional arguments are currently supported. - - ``` - env = gym.make('HumanoidStandup-v2') - ``` - - There is no v3 for HumanoidStandup, unlike the robot environments where a v3 and - beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. - - - ### Version History - - * v2: All continuous control environments now use mujoco_py >= 1.50 - * v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments. - * v0: Initial versions release (1.0.0) - - """ - def __init__(self): - mujoco_env.MujocoEnv.__init__(self, "humanoidstandup.xml", 5) + mujoco_env.MujocoEnv.__init__( + self, "humanoidstandup.xml", 5, mujoco_bindings="mujoco_py" + ) utils.EzPickle.__init__(self) def _get_obs(self): diff --git a/gym/envs/mujoco/humanoidstandup_v4.py b/gym/envs/mujoco/humanoidstandup_v4.py new file mode 100644 index 00000000000..3d28dec9f7a --- /dev/null +++ b/gym/envs/mujoco/humanoidstandup_v4.py @@ -0,0 +1,247 @@ +import numpy as np + +from gym import utils +from gym.envs.mujoco import mujoco_env + + +class HumanoidStandupEnv(mujoco_env.MujocoEnv, utils.EzPickle): + """ + ### Description + + This environment is based on the environment introduced by Tassa, Erez and Todorov + in ["Synthesis and stabilization of complex behaviors through online trajectory optimization"](https://ieeexplore.ieee.org/document/6386025). + The 3D bipedal robot is designed to simulate a human. It has a torso (abdomen) with a + pair of legs and arms. The legs each consist of two links, and so the arms (representing the + knees and elbows respectively). The environment starts with the humanoid layiing on the ground, + and then the goal of the environment is to make the humanoid standup and then keep it standing + by applying torques on the various hinges. + + ### Action Space + The agent take a 17-element vector for actions. + + The action space is a continuous `(action, ...)` all in `[-1, 1]`, where `action` + represents the numerical torques applied at the hinge joints. + + | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit | + |-----|----------------------|---------------|----------------|---------------------------------------|-------|------| + | 0 | Torque applied on the hinge in the y-coordinate of the abdomen | -0.4 | 0.4 | hip_1 (front_left_leg) | hinge | torque (N m) | + | 1 | Torque applied on the hinge in the z-coordinate of the abdomen | -0.4 | 0.4 | angle_1 (front_left_leg) | hinge | torque (N m) | + | 2 | Torque applied on the hinge in the x-coordinate of the abdomen | -0.4 | 0.4 | hip_2 (front_right_leg) | hinge | torque (N m) | + | 3 | Torque applied on the rotor between torso/abdomen and the right hip (x-coordinate) | -0.4 | 0.4 | right_hip_x (right_thigh) | hinge | torque (N m) | + | 4 | Torque applied on the rotor between torso/abdomen and the right hip (z-coordinate) | -0.4 | 0.4 | right_hip_z (right_thigh) | hinge | torque (N m) | + | 5 | Torque applied on the rotor between torso/abdomen and the right hip (y-coordinate) | -0.4 | 0.4 | right_hip_y (right_thigh) | hinge | torque (N m) | + | 6 | Torque applied on the rotor between the right hip/thigh and the right shin | -0.4 | 0.4 | right_knee | hinge | torque (N m) | + | 7 | Torque applied on the rotor between torso/abdomen and the left hip (x-coordinate) | -0.4 | 0.4 | left_hip_x (left_thigh) | hinge | torque (N m) | + | 8 | Torque applied on the rotor between torso/abdomen and the left hip (z-coordinate) | -0.4 | 0.4 | left_hip_z (left_thigh) | hinge | torque (N m) | + | 9 | Torque applied on the rotor between torso/abdomen and the left hip (y-coordinate) | -0.4 | 0.4 | left_hip_y (left_thigh) | hinge | torque (N m) | + | 10 | Torque applied on the rotor between the left hip/thigh and the left shin | -0.4 | 0.4 | left_knee | hinge | torque (N m) | + | 11 | Torque applied on the rotor between the torso and right upper arm (coordinate -1) | -0.4 | 0.4 | right_shoulder1 | hinge | torque (N m) | + | 12 | Torque applied on the rotor between the torso and right upper arm (coordinate -2) | -0.4 | 0.4 | right_shoulder2 | hinge | torque (N m) | + | 13 | Torque applied on the rotor between the right upper arm and right lower arm | -0.4 | 0.4 | right_elbow | hinge | torque (N m) | + | 14 | Torque applied on the rotor between the torso and left upper arm (coordinate -1) | -0.4 | 0.4 | left_shoulder1 | hinge | torque (N m) | + | 15 | Torque applied on the rotor between the torso and left upper arm (coordinate -2) | -0.4 | 0.4 | left_shoulder2 | hinge | torque (N m) | + | 16 | Torque applied on the rotor between the left upper arm and left lower arm | -0.4 | 0.4 | left_elbow | hinge | torque (N m) | + + ### Observation Space + + The state space consists of positional values of different body parts of the Humanoid, + followed by the velocities of those individual parts (their derivatives) with all the positions ordered before all the velocities. + + The observation is a `ndarray` with shape `(376,)` where the elements correspond to the following: + + | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit | + |-----|---------------------------------------------------------|----------------|-----------------|----------------------------------------|-------|------| + | 0 | x-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) | + | 1 | y-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) | + | 2 | z-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) | + | 3 | x-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) | + | 4 | y-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) | + | 5 | z-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) | + | 6 | w-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) | + | 7 | z-angle of the abdomen (in lower_waist) | -Inf | Inf | abdomen_z | hinge | angle (rad) | + | 8 | y-angle of the abdomen (in lower_waist) | -Inf | Inf | abdomen_y | hinge | angle (rad) | + | 9 | x-angle of the abdomen (in pelvis) | -Inf | Inf | abdomen_x | hinge | angle (rad) | + | 10 | x-coordinate of angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_x | hinge | angle (rad) | + | 11 | z-coordinate of angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_z | hinge | angle (rad) | + | 12 | y-coordinate of angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_y | hinge | angle (rad) | + | 13 | angle between right hip and the right shin (in right_knee) | -Inf | Inf | right_knee | hinge | angle (rad) | + | 14 | x-coordinate of angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_x | hinge | angle (rad) | + | 15 | z-coordinate of angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_z | hinge | angle (rad) | + | 16 | y-coordinate of angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_y | hinge | angle (rad) | + | 17 | angle between left hip and the left shin (in left_knee) | -Inf | Inf | left_knee | hinge | angle (rad) | + | 18 | coordinate-1 (multi-axis) angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder1 | hinge | angle (rad) | + | 19 | coordinate-2 (multi-axis) angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder2 | hinge | angle (rad) | + | 20 | angle between right upper arm and right_lower_arm | -Inf | Inf | right_elbow | hinge | angle (rad) | + | 21 | coordinate-1 (multi-axis) angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder1 | hinge | angle (rad) | + | 22 | coordinate-2 (multi-axis) angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder2 | hinge | angle (rad) | + | 23 | angle between left upper arm and left_lower_arm | -Inf | Inf | left_elbow | hinge | angle (rad) | + | 24 | x-coordinate velocity of the torso (centre) | -Inf | Inf | root | free | velocity (m/s) | + | 25 | y-coordinate velocity of the torso (centre) | -Inf | Inf | root | free | velocity (m/s) | + | 26 | z-coordinate velocity of the torso (centre) | -Inf | Inf | root | free | velocity (m/s) | + | 27 | x-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | anglular velocity (rad/s) | + | 28 | y-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | anglular velocity (rad/s) | + | 29 | z-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | anglular velocity (rad/s) | + | 30 | z-coordinate of angular velocity of the abdomen (in lower_waist) | -Inf | Inf | abdomen_z | hinge | anglular velocity (rad/s) | + | 31 | y-coordinate of angular velocity of the abdomen (in lower_waist) | -Inf | Inf | abdomen_y | hinge | anglular velocity (rad/s) | + | 32 | x-coordinate of angular velocity of the abdomen (in pelvis) | -Inf | Inf | abdomen_x | hinge | aanglular velocity (rad/s) | + | 33 | x-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_x | hinge | anglular velocity (rad/s) | + | 34 | z-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_z | hinge | anglular velocity (rad/s) | + | 35 | y-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_y | hinge | anglular velocity (rad/s) | + | 36 | angular velocity of the angle between right hip and the right shin (in right_knee) | -Inf | Inf | right_knee | hinge | anglular velocity (rad/s) | + | 37 | x-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_x | hinge | anglular velocity (rad/s) | + | 38 | z-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_z | hinge | anglular velocity (rad/s) | + | 39 | y-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_y | hinge | anglular velocity (rad/s) | + | 40 | angular velocity of the angle between left hip and the left shin (in left_knee) | -Inf | Inf | left_knee | hinge | anglular velocity (rad/s) | + | 41 | coordinate-1 (multi-axis) of the angular velocity of the angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder1 | hinge | anglular velocity (rad/s) | + | 42 | coordinate-2 (multi-axis) of the angular velocity of the angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder2 | hinge | anglular velocity (rad/s) | + | 43 | angular velocity of the angle between right upper arm and right_lower_arm | -Inf | Inf | right_elbow | hinge | anglular velocity (rad/s) | + | 44 | coordinate-1 (multi-axis) of the angular velocity of the angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder1 | hinge | anglular velocity (rad/s) | + | 45 | coordinate-2 (multi-axis) of the angular velocity of the angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder2 | hinge | anglular velocity (rad/s) | + | 46 | angular velocitty of the angle between left upper arm and left_lower_arm | -Inf | Inf | left_elbow | hinge | anglular velocity (rad/s) | + + + Additionally, after all the positional and velocity based values in the table, + the state_space consists of (in order): + - *cinert:* Mass and inertia of a single rigid body relative to the center of mass + (this is an intermediate result of transition). It has shape 14*10 (*nbody * 10*) + and hence adds to another 140 elements in the state space. + - *cvel:* Center of mass based velocity. It has shape 14 * 6 (*nbody * 6*) and hence + adds another 84 elements in the state space + - *qfrc_actuator:* Constraint force generated as the actuator force. This has shape + `(23,)` *(nv * 1)* and hence adds another 23 elements to the state space. + - *cfrc_ext:* This is the center of mass based external force on the body. It has shape + 14 * 6 (*nbody * 6*) and hence adds to another 84 elements in the state space. + where *nbody* stands for the number of bodies in the robot and *nv* stands for the number + of degrees of freedom (*= dim(qvel)*) + + The (x,y,z) coordinates are translational DOFs while the orientations are rotational + DOFs expressed as quaternions. One can read more about free joints on the + [Mujoco Documentation](https://mujoco.readthedocs.io/en/latest/XMLreference.html). + + **Note:** There are 47 elements in the table above - giving rise to `(378,)` + elements in the state space. In practice (and Gym implementation), the first two + positional elements are omitted from the state space since the reward function is + calculated based on the x-coordinate value. This value is hidden from the algorithm, + which in turn has to develop an abstract understanding of it from the observed rewards. + Therefore, observation space has shape `(376,)` instead of `(378,)` and the table should not have the first two rows. + + **Note:** HumanoidStandup-v4 environment no longer has the following contact forces issue. + If using previous HumanoidStandup versions from v4, there have been reported issues that using a Mujoco-Py version > 2.0 results + in the contact forces always being 0. As such we recommend to use a Mujoco-Py version < 2.0 + when using the Humanoid environment if you would like to report results with contact forces + (if contact forces are not used in your experiments, you can use version > 2.0). + + ### Rewards + The reward consists of three parts: + - *uph_cost*: A reward for moving upward (in an attempt to stand up). This is not a relative + reward which meaures how much upward it has moved from the last timestep, but it is an + absolute reward which measures how much upward the Humanoid has moved overall. It is + measured as *(z coordinate after action - 0)/(atomic timestep)*, where *z coordinate after + action* is index 0 in the state/index 2 in the table, and *atomic timestep* is the time for + one frame of movement even though the simulation has a framerate of 5 (done in order to inflate + rewards a little for fassteer learning). + - *quad_ctrl_cost*: A negative reward for penalising the humanoid if it has too large of + a control force. If there are *nu* actuators/controls, then the control has shape `nu x 1`. + It is measured as *0.1 **x** sum(control2)*. + - *quad_impact_cost*: A negative reward for penalising the humanoid if the external + contact force is too large. It is calculated as *min(0.5 * 0.000001 * sum(external + contact force2), 10)*. + + The total reward returned is ***reward*** *=* *uph_cost + 1 - quad_ctrl_cost - quad_impact_cost* + + ### Starting State + All observations start in state + (0.0, 0.0, 0.105, 1.0, 0.0 ... 0.0) with a uniform noise in the range of + [-0.01, 0.01] added to the positional and velocity values (values in the table) + for stochasticity. Note that the initial z coordinate is intentionally selected + to be low, thereby indicating a laying down humanoid. The initial orientation is + designed to make it face forward as well. + + ### Episode Termination + The episode terminates when any of the following happens: + + 1. The episode duration reaches a 1000 timesteps + 2. Any of the state space values is no longer finite + + ### Arguments + + No additional arguments are currently supported (in v2 and lower), but + modifications can be made to the XML file in the assets folder + (or by changing the path to a modified XML file in another folder).. + + ``` + env = gym.make('HumanoidStandup-v2') + ``` + + There is no v3 for HumanoidStandup, unlike the robot environments where a v3 and + beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. + + + ### Version History + + * v4: all mujoco environments now use the mujoco binidings in mujoco>=2.1.3 + * v3: support for gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. rgb rendering comes from tracking camera (so agent does not run away from screen) + * v2: All continuous control environments now use mujoco_py >= 1.50 + * v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments. + * v0: Initial versions release (1.0.0) + + """ + + def __init__(self): + mujoco_env.MujocoEnv.__init__(self, "humanoidstandup.xml", 5) + utils.EzPickle.__init__(self) + + def _get_obs(self): + data = self.data + return np.concatenate( + [ + data.qpos.flat[2:], + data.qvel.flat, + data.cinert.flat, + data.cvel.flat, + data.qfrc_actuator.flat, + data.cfrc_ext.flat, + ] + ) + + def step(self, a): + self.do_simulation(a, self.frame_skip) + pos_after = self.data.qpos[2] + data = self.data + uph_cost = (pos_after - 0) / self.model.opt.timestep + + quad_ctrl_cost = 0.1 * np.square(data.ctrl).sum() + quad_impact_cost = 0.5e-6 * np.square(data.cfrc_ext).sum() + quad_impact_cost = min(quad_impact_cost, 10) + reward = uph_cost - quad_ctrl_cost - quad_impact_cost + 1 + + done = bool(False) + return ( + self._get_obs(), + reward, + done, + dict( + reward_linup=uph_cost, + reward_quadctrl=-quad_ctrl_cost, + reward_impact=-quad_impact_cost, + ), + ) + + def reset_model(self): + c = 0.01 + self.set_state( + self.init_qpos + self.np_random.uniform(low=-c, high=c, size=self.model.nq), + self.init_qvel + + self.np_random.uniform( + low=-c, + high=c, + size=self.model.nv, + ), + ) + return self._get_obs() + + def viewer_setup(self): + self.viewer.cam.trackbodyid = 1 + self.viewer.cam.distance = self.model.stat.extent * 1.0 + self.viewer.cam.lookat[2] = 0.8925 + self.viewer.cam.elevation = -20 diff --git a/gym/envs/mujoco/inverted_double_pendulum.py b/gym/envs/mujoco/inverted_double_pendulum.py index f523d170e68..2f241c02b43 100644 --- a/gym/envs/mujoco/inverted_double_pendulum.py +++ b/gym/envs/mujoco/inverted_double_pendulum.py @@ -5,113 +5,10 @@ class InvertedDoublePendulumEnv(mujoco_env.MujocoEnv, utils.EzPickle): - """ - ### Description - - This environment originates from control theory and builds on the cartpole - environment based on the work done by Barto, Sutton, and Anderson in - ["Neuronlike adaptive elements that can solve difficult learning control problems"](https://ieeexplore.ieee.org/document/6313077), - powered by the Mujoco physics simulator - allowing for more complex experiments - (such as varying the effects of gravity or constraints). This environment involves a cart that can - moved linearly, with a pole fixed on it and a second pole fixed on the other end of the first one - (leaving the second pole as the only one with one free end). The cart can be pushed left or right, - and the goal is to balance the second pole on top of the first pole, which is in turn on top of the - cart, by applying continuous forces on the cart. - - ### Action Space - The agent take a 1-element vector for actions. - The action space is a continuous `(action)` in `[-1, 1]`, where `action` represents the - numerical force applied to the cart (with magnitude representing the amount of force and - sign representing the direction) - - | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit | - |-----|---------------------------|-------------|-------------|----------------------------------|-------|-----------| - | 0 | Force applied on the cart | -1 | 1 | slider | slide | Force (N) | - - ### Observation Space - - The state space consists of positional values of different body parts of the pendulum system, - followed by the velocities of those individual parts (their derivatives) with all the - positions ordered before all the velocities. - - The observation is a `ndarray` with shape `(11,)` where the elements correspond to the following: - - | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint| Unit | - |-----|-----------------------|----------------------|--------------------|----------------------|--------------------|--------------------| - | 0 | position of the cart along the linear surface | -Inf | Inf | slider | slide | position (m) | - | 1 | sine of the angle between the cart and the first pole | -Inf | Inf | sin(hinge) | hinge | unitless | - | 2 | sine of the angle between the two poles | -Inf | Inf | sin(hinge2) | hinge | unitless | - | 3 | cosine of the angle between the cart and the first pole | -Inf | Inf | cos(hinge) | hinge | unitless | - | 4 | cosine of the angle between the two poles | -Inf | Inf | cos(hinge2) | hinge | unitless | - | 5 | velocity of the cart | -Inf | Inf | slider | slide | velocity (m/s) | - | 6 | angular velocity of the angle between the cart and the first pole | -Inf | Inf | hinge | hinge | angular velocity (rad/s) | - | 7 | angular velocity of the angle between the two poles | -Inf | Inf | hinge2 | hinge | angular velocity (rad/s) | - | 8 | constraint force - 1 | -Inf | Inf | | | Force (N) | - | 9 | constraint force - 2 | -Inf | Inf | | | Force (N) | - | 10 | constraint force - 3 | -Inf | Inf | | | Force (N) | - - - There is physical contact between the robots and their environment - and Mujoco - attempts at getting realisitic physics simulations for the possible physical contact - dynamics by aiming for physical accuracy and computational efficiency. - - There is one constraint force for contacts for each degree of freedom (3). - The approach and handling of constraints by Mujoco is unique to the simulator - and is based on their research. Once can find more information in their - [*documentation*](https://mujoco.readthedocs.io/en/latest/computation.html) - or in their paper - ["Analytically-invertible dynamics with contacts and constraints: Theory and implementation in MuJoCo"](https://homes.cs.washington.edu/~todorov/papers/TodorovICRA14.pdf). - - - ### Rewards - - The reward consists of two parts: - - *alive_bonus*: The goal is to make the second inverted pendulum stand upright - (within a certain angle limit) as long as possible - as such a reward of +10 is awarded - for each timestep that the second pole is upright. - - *distance_penalty*: This reward is a measure of how far the *tip* of the second pendulum - (the only free end) moves, and it is calculated as - *0.01 * x2 + (y - 2)2*, where *x* is the x-coordinate of the tip - and *y* is the y-coordinate of the tip of the second pole. - - *velocity_penalty*: A negative reward for penalising the agent if it moves too - fast *0.001 * v12 + 0.005 * v2 2* - - The total reward returned is ***reward*** *=* *alive_bonus - distance_penalty - velocity_penalty* - - ### Starting State - All observations start in state - (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) with a uniform noise in the range - of [-0.1, 0.1] added to the positional values (cart position and pole angles) and standard - normal force with a standard deviation of 0.1 added to the velocity values for stochasticity. - - ### Episode Termination - The episode terminates when any of the following happens: - - 1. The episode duration reaches 1000 timesteps. - 2. Any of the state space values is no longer finite. - 3. The y_coordinate of the tip of the second pole *is less than or equal* to 1. The maximum standing height of the system is 1.196 m when all the parts are perpendicularly vertical on top of each other). - - ### Arguments - - No additional arguments are currently supported. - - ``` - env = gym.make('InvertedDoublePendulum-v2') - ``` - There is no v3 for InvertedPendulum, unlike the robot environments where a v3 and - beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. - - - ### Version History - - * v2: All continuous control environments now use mujoco_py >= 1.50 - * v1: max_time_steps raised to 1000 for robot based tasks (including inverted pendulum) - * v0: Initial versions release (1.0.0) - - """ - def __init__(self): - mujoco_env.MujocoEnv.__init__(self, "inverted_double_pendulum.xml", 5) + mujoco_env.MujocoEnv.__init__( + self, "inverted_double_pendulum.xml", 5, mujoco_bindings="mujoco_py" + ) utils.EzPickle.__init__(self) def step(self, action): diff --git a/gym/envs/mujoco/inverted_double_pendulum_v4.py b/gym/envs/mujoco/inverted_double_pendulum_v4.py new file mode 100644 index 00000000000..5c1d162ad0e --- /dev/null +++ b/gym/envs/mujoco/inverted_double_pendulum_v4.py @@ -0,0 +1,156 @@ +import numpy as np + +from gym import utils +from gym.envs.mujoco import mujoco_env + + +class InvertedDoublePendulumEnv(mujoco_env.MujocoEnv, utils.EzPickle): + """ + ### Description + + This environment originates from control theory and builds on the cartpole + environment based on the work done by Barto, Sutton, and Anderson in + ["Neuronlike adaptive elements that can solve difficult learning control problems"](https://ieeexplore.ieee.org/document/6313077), + powered by the Mujoco physics simulator - allowing for more complex experiments + (such as varying the effects of gravity or constraints). This environment involves a cart that can + moved linearly, with a pole fixed on it and a second pole fixed on the other end of the first one + (leaving the second pole as the only one with one free end). The cart can be pushed left or right, + and the goal is to balance the second pole on top of the first pole, which is in turn on top of the + cart, by applying continuous forces on the cart. + + ### Action Space + The agent take a 1-element vector for actions. + The action space is a continuous `(action)` in `[-1, 1]`, where `action` represents the + numerical force applied to the cart (with magnitude representing the amount of force and + sign representing the direction) + + | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit | + |-----|---------------------------|-------------|-------------|----------------------------------|-------|-----------| + | 0 | Force applied on the cart | -1 | 1 | slider | slide | Force (N) | + + ### Observation Space + + The state space consists of positional values of different body parts of the pendulum system, + followed by the velocities of those individual parts (their derivatives) with all the + positions ordered before all the velocities. + + The observation is a `ndarray` with shape `(11,)` where the elements correspond to the following: + + | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint| Unit | + |-----|-----------------------|----------------------|--------------------|----------------------|--------------------|--------------------| + | 0 | position of the cart along the linear surface | -Inf | Inf | slider | slide | position (m) | + | 1 | sine of the angle between the cart and the first pole | -Inf | Inf | sin(hinge) | hinge | unitless | + | 2 | sine of the angle between the two poles | -Inf | Inf | sin(hinge2) | hinge | unitless | + | 3 | cosine of the angle between the cart and the first pole | -Inf | Inf | cos(hinge) | hinge | unitless | + | 4 | cosine of the angle between the two poles | -Inf | Inf | cos(hinge2) | hinge | unitless | + | 5 | velocity of the cart | -Inf | Inf | slider | slide | velocity (m/s) | + | 6 | angular velocity of the angle between the cart and the first pole | -Inf | Inf | hinge | hinge | angular velocity (rad/s) | + | 7 | angular velocity of the angle between the two poles | -Inf | Inf | hinge2 | hinge | angular velocity (rad/s) | + | 8 | constraint force - 1 | -Inf | Inf | | | Force (N) | + | 9 | constraint force - 2 | -Inf | Inf | | | Force (N) | + | 10 | constraint force - 3 | -Inf | Inf | | | Force (N) | + + + There is physical contact between the robots and their environment - and Mujoco + attempts at getting realisitic physics simulations for the possible physical contact + dynamics by aiming for physical accuracy and computational efficiency. + + There is one constraint force for contacts for each degree of freedom (3). + The approach and handling of constraints by Mujoco is unique to the simulator + and is based on their research. Once can find more information in their + [*documentation*](https://mujoco.readthedocs.io/en/latest/computation.html) + or in their paper + ["Analytically-invertible dynamics with contacts and constraints: Theory and implementation in MuJoCo"](https://homes.cs.washington.edu/~todorov/papers/TodorovICRA14.pdf). + + + ### Rewards + + The reward consists of two parts: + - *alive_bonus*: The goal is to make the second inverted pendulum stand upright + (within a certain angle limit) as long as possible - as such a reward of +10 is awarded + for each timestep that the second pole is upright. + - *distance_penalty*: This reward is a measure of how far the *tip* of the second pendulum + (the only free end) moves, and it is calculated as + *0.01 * x2 + (y - 2)2*, where *x* is the x-coordinate of the tip + and *y* is the y-coordinate of the tip of the second pole. + - *velocity_penalty*: A negative reward for penalising the agent if it moves too + fast *0.001 * v12 + 0.005 * v2 2* + + The total reward returned is ***reward*** *=* *alive_bonus - distance_penalty - velocity_penalty* + + ### Starting State + All observations start in state + (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) with a uniform noise in the range + of [-0.1, 0.1] added to the positional values (cart position and pole angles) and standard + normal force with a standard deviation of 0.1 added to the velocity values for stochasticity. + + ### Episode Termination + The episode terminates when any of the following happens: + + 1. The episode duration reaches 1000 timesteps. + 2. Any of the state space values is no longer finite. + 3. The y_coordinate of the tip of the second pole *is less than or equal* to 1. The maximum standing height of the system is 1.196 m when all the parts are perpendicularly vertical on top of each other). + + ### Arguments + + No additional arguments are currently supported (in v2 and lower), but modifications can + be made to the XML file in the assets folder (or by changing the path to a modified XML + file in another folder).. + + ``` + env = gym.make('InvertedDoublePendulum-v2') + ``` + There is no v3 for InvertedPendulum, unlike the robot environments where a v3 and + beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. + + + ### Version History + + * v4: all mujoco environments now use the mujoco binidings in mujoco>=2.1.3 + * v3: support for gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. rgb rendering comes from tracking camera (so agent does not run away from screen) + * v2: All continuous control environments now use mujoco_py >= 1.50 + * v1: max_time_steps raised to 1000 for robot based tasks (including inverted pendulum) + * v0: Initial versions release (1.0.0) + + """ + + def __init__(self): + mujoco_env.MujocoEnv.__init__(self, "inverted_double_pendulum.xml", 5) + utils.EzPickle.__init__(self) + + def step(self, action): + self.do_simulation(action, self.frame_skip) + ob = self._get_obs() + x, _, y = self.data.site_xpos[0] + dist_penalty = 0.01 * x**2 + (y - 2) ** 2 + v1, v2 = self.data.qvel[1:3] + vel_penalty = 1e-3 * v1**2 + 5e-3 * v2**2 + alive_bonus = 10 + r = alive_bonus - dist_penalty - vel_penalty + done = bool(y <= 1) + return ob, r, done, {} + + def _get_obs(self): + return np.concatenate( + [ + self.data.qpos[:1], # cart x pos + np.sin(self.data.qpos[1:]), # link angles + np.cos(self.data.qpos[1:]), + np.clip(self.data.qvel, -10, 10), + np.clip(self.data.qfrc_constraint, -10, 10), + ] + ).ravel() + + def reset_model(self): + self.set_state( + self.init_qpos + + self.np_random.uniform(low=-0.1, high=0.1, size=self.model.nq), + self.init_qvel + self.np_random.standard_normal(self.model.nv) * 0.1, + ) + return self._get_obs() + + def viewer_setup(self): + v = self.viewer + v.cam.trackbodyid = 0 + v.cam.distance = self.model.stat.extent * 0.5 + v.cam.lookat[2] = 0.12250000000000005 # v.model.stat.center[2] diff --git a/gym/envs/mujoco/inverted_pendulum.py b/gym/envs/mujoco/inverted_pendulum.py index 46472cf43a9..342404b5e91 100644 --- a/gym/envs/mujoco/inverted_pendulum.py +++ b/gym/envs/mujoco/inverted_pendulum.py @@ -5,85 +5,11 @@ class InvertedPendulumEnv(mujoco_env.MujocoEnv, utils.EzPickle): - """ - ### Description - - This environment is the cartpole environment based on the work done by - Barto, Sutton, and Anderson in ["Neuronlike adaptive elements that can - solve difficult learning control problems"](https://ieeexplore.ieee.org/document/6313077), - just like in the classic environments but now powered by the Mujoco physics simulator - - allowing for more complex experiments (such as varying the effects of gravity). - This environment involves a cart that can moved linearly, with a pole fixed on it - at one end and having another end free. The cart can be pushed left or right, and the - goal is to balance the pole on the top of the cart by applying forces on the cart. - - ### Action Space - The agent take a 1-element vector for actions. - - The action space is a continuous `(action)` in `[-3, 3]`, where `action` represents - the numerical force applied to the cart (with magnitude representing the amount of - force and sign representing the direction) - - | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit | - |-----|---------------------------|-------------|-------------|----------------------------------|-------|-----------| - | 0 | Force applied on the cart | -3 | 3 | slider | slide | Force (N) | - - ### Observation Space - - The state space consists of positional values of different body parts of - the pendulum system, followed by the velocities of those individual parts (their derivatives) - with all the positions ordered before all the velocities. - - The observation is a `ndarray` with shape `(4,)` where the elements correspond to the following: - - | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint| Unit | - |-----|-----------------------|----------------------|--------------------|----------------------|--------------------|--------------------| - | 0 | position of the cart along the linear surface | -Inf | Inf | slider | slide | position (m) | - | 1 | vertical angle of the pole on the cart | -Inf | Inf | hinge | hinge | angle (rad) | - | 2 | linear velocity of the cart | -Inf | Inf | slider | slide | velocity (m/s) | - | 3 | angular velocity of the pole on the cart | -Inf | Inf | hinge | hinge | anglular velocity (rad/s) | - - - ### Rewards - - The goal is to make the inverted pendulum stand upright (within a certain angle limit) - as long as possible - as such a reward of +1 is awarded for each timestep that - the pole is upright. - - ### Starting State - All observations start in state - (0.0, 0.0, 0.0, 0.0) with a uniform noise in the range - of [-0.01, 0.01] added to the values for stochasticity. - - ### Episode Termination - The episode terminates when any of the following happens: - - 1. The episode duration reaches 1000 timesteps. - 2. Any of the state space values is no longer finite. - 3. The absolutely value of the vertical angle between the pole and the cart is greater than 0.2 radian. - - ### Arguments - - No additional arguments are currently supported. - - ``` - env = gym.make('InvertedPendulum-v2') - ``` - There is no v3 for InvertedPendulum, unlike the robot environments where a - v3 and beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. - - - ### Version History - - * v2: All continuous control environments now use mujoco_py >= 1.50 - * v1: max_time_steps raised to 1000 for robot based tasks (including inverted pendulum) - * v0: Initial versions release (1.0.0) - - """ - def __init__(self): utils.EzPickle.__init__(self) - mujoco_env.MujocoEnv.__init__(self, "inverted_pendulum.xml", 2) + mujoco_env.MujocoEnv.__init__( + self, "inverted_pendulum.xml", 2, mujoco_bindings="mujoco_py" + ) def step(self, a): reward = 1.0 diff --git a/gym/envs/mujoco/inverted_pendulum_v4.py b/gym/envs/mujoco/inverted_pendulum_v4.py new file mode 100644 index 00000000000..7d3fa72200e --- /dev/null +++ b/gym/envs/mujoco/inverted_pendulum_v4.py @@ -0,0 +1,116 @@ +import numpy as np + +from gym import utils +from gym.envs.mujoco import mujoco_env + + +class InvertedPendulumEnv(mujoco_env.MujocoEnv, utils.EzPickle): + """ + ### Description + + This environment is the cartpole environment based on the work done by + Barto, Sutton, and Anderson in ["Neuronlike adaptive elements that can + solve difficult learning control problems"](https://ieeexplore.ieee.org/document/6313077), + just like in the classic environments but now powered by the Mujoco physics simulator - + allowing for more complex experiments (such as varying the effects of gravity). + This environment involves a cart that can moved linearly, with a pole fixed on it + at one end and having another end free. The cart can be pushed left or right, and the + goal is to balance the pole on the top of the cart by applying forces on the cart. + + ### Action Space + The agent take a 1-element vector for actions. + + The action space is a continuous `(action)` in `[-3, 3]`, where `action` represents + the numerical force applied to the cart (with magnitude representing the amount of + force and sign representing the direction) + + | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit | + |-----|---------------------------|-------------|-------------|----------------------------------|-------|-----------| + | 0 | Force applied on the cart | -3 | 3 | slider | slide | Force (N) | + + ### Observation Space + + The state space consists of positional values of different body parts of + the pendulum system, followed by the velocities of those individual parts (their derivatives) + with all the positions ordered before all the velocities. + + The observation is a `ndarray` with shape `(4,)` where the elements correspond to the following: + + | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint| Unit | + |-----|-----------------------|----------------------|--------------------|----------------------|--------------------|--------------------| + | 0 | position of the cart along the linear surface | -Inf | Inf | slider | slide | position (m) | + | 1 | vertical angle of the pole on the cart | -Inf | Inf | hinge | hinge | angle (rad) | + | 2 | linear velocity of the cart | -Inf | Inf | slider | slide | velocity (m/s) | + | 3 | angular velocity of the pole on the cart | -Inf | Inf | hinge | hinge | anglular velocity (rad/s) | + + + ### Rewards + + The goal is to make the inverted pendulum stand upright (within a certain angle limit) + as long as possible - as such a reward of +1 is awarded for each timestep that + the pole is upright. + + ### Starting State + All observations start in state + (0.0, 0.0, 0.0, 0.0) with a uniform noise in the range + of [-0.01, 0.01] added to the values for stochasticity. + + ### Episode Termination + The episode terminates when any of the following happens: + + 1. The episode duration reaches 1000 timesteps. + 2. Any of the state space values is no longer finite. + 3. The absolutely value of the vertical angle between the pole and the cart is greater than 0.2 radian. + + ### Arguments + + No additional arguments are currently supported (in v2 and lower), + but modifications can be made to the XML file in the assets folder + (or by changing the path to a modified XML file in another folder). + + ``` + env = gym.make('InvertedPendulum-v2') + ``` + There is no v3 for InvertedPendulum, unlike the robot environments where a + v3 and beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. + + + ### Version History + + * v4: all mujoco environments now use the mujoco binidings in mujoco>=2.1.3 + * v3: support for gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. rgb rendering comes from tracking camera (so agent does not run away from screen) + * v2: All continuous control environments now use mujoco_py >= 1.50 + * v1: max_time_steps raised to 1000 for robot based tasks (including inverted pendulum) + * v0: Initial versions release (1.0.0) + + """ + + def __init__(self): + utils.EzPickle.__init__(self) + mujoco_env.MujocoEnv.__init__(self, "inverted_pendulum.xml", 2) + + def step(self, a): + reward = 1.0 + self.do_simulation(a, self.frame_skip) + ob = self._get_obs() + notdone = np.isfinite(ob).all() and (np.abs(ob[1]) <= 0.2) + done = not notdone + return ob, reward, done, {} + + def reset_model(self): + qpos = self.init_qpos + self.np_random.uniform( + size=self.model.nq, low=-0.01, high=0.01 + ) + qvel = self.init_qvel + self.np_random.uniform( + size=self.model.nv, low=-0.01, high=0.01 + ) + self.set_state(qpos, qvel) + return self._get_obs() + + def _get_obs(self): + return np.concatenate([self.data.qpos, self.data.qvel]).ravel() + + def viewer_setup(self): + v = self.viewer + v.cam.trackbodyid = 0 + v.cam.distance = self.model.stat.extent diff --git a/gym/envs/mujoco/mujoco_env.py b/gym/envs/mujoco/mujoco_env.py index 55c3884ac87..7bf3b260439 100644 --- a/gym/envs/mujoco/mujoco_env.py +++ b/gym/envs/mujoco/mujoco_env.py @@ -1,4 +1,3 @@ -import os from collections import OrderedDict from os import path from typing import Optional @@ -6,18 +5,9 @@ import numpy as np import gym -from gym import error, spaces - -try: - import mujoco_py -except ImportError as e: - raise error.DependencyNotInstalled( - "{}. (HINT: you need to install mujoco_py, and also perform the setup instructions here: https://github.com/openai/mujoco-py/.)".format( - e - ) - ) +from gym import error, logger, spaces -DEFAULT_SIZE = 500 +DEFAULT_SIZE = 480 def convert_observation_to_space(observation): @@ -43,28 +33,64 @@ def convert_observation_to_space(observation): class MujocoEnv(gym.Env): """Superclass for all MuJoCo environments.""" - def __init__(self, model_path, frame_skip): + def __init__(self, model_path, frame_skip, mujoco_bindings="mujoco"): + if model_path.startswith("/"): fullpath = model_path else: - fullpath = os.path.join(os.path.dirname(__file__), "assets", model_path) + fullpath = path.join(path.dirname(__file__), "assets", model_path) if not path.exists(fullpath): raise OSError(f"File {fullpath} does not exist") + + if mujoco_bindings == "mujoco_py": + logger.warn( + "This version of the mujoco environments depends " + "on the mujoco-py bindings, which are no longer maintained " + "and may stop working. Please upgrade to the v4 versions of " + "the environments (which depend on the mujoco python bindings instead), unless " + "you are trying to precisely replicate previous works)." + ) + try: + import mujoco_py + + self._mujoco_bindings = mujoco_py + except ImportError as e: + raise error.DependencyNotInstalled( + "{}. (HINT: you need to install mujoco_py, and also perform the setup instructions here: https://github.com/openai/mujoco-py/.)".format( + e + ) + ) + + self.model = self._mujoco_bindings.load_model_from_path(fullpath) + self.sim = self._mujoco_bindings.MjSim(self.model) + self.data = self.sim.data + + elif mujoco_bindings == "mujoco": + try: + import mujoco + + self._mujoco_bindings = mujoco + + except ImportError as e: + raise error.DependencyNotInstalled( + f"{e}. (HINT: you need to install mujoco)" + ) + self.model = self._mujoco_bindings.MjModel.from_xml_path(fullpath) + self.data = self._mujoco_bindings.MjData(self.model) + + self.init_qpos = self.data.qpos.ravel().copy() + self.init_qvel = self.data.qvel.ravel().copy() + self._viewers = {} + self.frame_skip = frame_skip - self.model = mujoco_py.load_model_from_path(fullpath) - self.sim = mujoco_py.MjSim(self.model) - self.data = self.sim.data + self.viewer = None - self._viewers = {} self.metadata = { "render_modes": ["human", "rgb_array", "depth_array"], "render_fps": int(np.round(1.0 / self.dt)), } - self.init_qpos = self.sim.data.qpos.ravel().copy() - self.init_qvel = self.sim.data.qvel.ravel().copy() - self._set_action_space() action = self.action_space.sample() @@ -111,7 +137,12 @@ def reset( options: Optional[dict] = None, ): super().reset(seed=seed) - self.sim.reset() + + if self._mujoco_bindings.__name__ == "mujoco_py": + self.sim.reset() + else: + self._mujoco_bindings.mj_resetData(self.model, self.data) + ob = self.reset_model() if not return_info: return ob @@ -120,12 +151,19 @@ def reset( def set_state(self, qpos, qvel): assert qpos.shape == (self.model.nq,) and qvel.shape == (self.model.nv,) - old_state = self.sim.get_state() - new_state = mujoco_py.MjSimState( - old_state.time, qpos, qvel, old_state.act, old_state.udd_state - ) - self.sim.set_state(new_state) - self.sim.forward() + if self._mujoco_bindings.__name__ == "mujoco_py": + state = self.sim.get_state() + state = self._mujoco_bindings.MjSimState( + state.time, qpos, qvel, state.act, state.udd_state + ) + self.sim.set_state(state) + self.sim.forward() + else: + self.data.qpos[:] = np.copy(qpos) + self.data.qvel[:] = np.copy(qvel) + if self.model.na == 0: + self.data.act[:] = None + self._mujoco_bindings.mj_forward(self.model, self.data) @property def dt(self): @@ -135,9 +173,21 @@ def do_simulation(self, ctrl, n_frames): if np.array(ctrl).shape != self.action_space.shape: raise ValueError("Action dimension mismatch") - self.sim.data.ctrl[:] = ctrl + if self._mujoco_bindings.__name__ == "mujoco_py": + self.sim.data.ctrl[:] = ctrl + else: + self.data.ctrl[:] = ctrl for _ in range(n_frames): - self.sim.step() + if self._mujoco_bindings.__name__ == "mujoco_py": + self.sim.step() + else: + self._mujoco_bindings.mj_step(self.model, self.data) + + # As of MuJoCo 2.0, force-related quantities like cacc are not computed + # unless there's a force sensor in the model. + # See https://github.com/openai/gym/issues/1541 + if self._mujoco_bindings.__name__ != "mujoco_py": + self._mujoco_bindings.mj_rnePostConstraint(self.model, self.data) def render( self, @@ -158,19 +208,25 @@ def render( if no_camera_specified: camera_name = "track" - if camera_id is None and camera_name in self.model._camera_name2id: - camera_id = self.model.camera_name2id(camera_name) + if camera_id is None: + if self._mujoco_bindings.__name__ == "mujoco_py": + if camera_name in self.model._camera_name2id: + camera_id = self.model.camera_name2id(camera_name) + else: + camera_id = self._mujoco_bindings.mj_name2id( + self.model, + self._mujoco_bindings.mjtObj.mjOBJ_CAMERA, + camera_name, + ) - self._get_viewer(mode).render(width, height, camera_id=camera_id) + self._get_viewer(mode).render(width, height, camera_id=camera_id) if mode == "rgb_array": - # window size used for old mujoco-py: data = self._get_viewer(mode).read_pixels(width, height, depth=False) # original image is upside-down, so flip it return data[::-1, :, :] elif mode == "depth_array": self._get_viewer(mode).render(width, height) - # window size used for old mujoco-py: # Extract depth part of the read_pixels() tuple data = self._get_viewer(mode).read_pixels(width, height, depth=True)[1] # original image is upside-down, so flip it @@ -180,24 +236,40 @@ def render( def close(self): if self.viewer is not None: - # self.viewer.finish() self.viewer = None self._viewers = {} - def _get_viewer(self, mode): + def _get_viewer(self, mode, width=DEFAULT_SIZE, height=DEFAULT_SIZE): self.viewer = self._viewers.get(mode) if self.viewer is None: if mode == "human": - self.viewer = mujoco_py.MjViewer(self.sim) + if self._mujoco_bindings.__name__ == "mujoco_py": + self.viewer = self._mujoco_bindings.MjViewer(self.sim) + else: + from gym.envs.mujoco.mujoco_rendering import Viewer + + self.viewer = Viewer(self.model, self.data) elif mode == "rgb_array" or mode == "depth_array": - self.viewer = mujoco_py.MjRenderContextOffscreen(self.sim, -1) + if self._mujoco_bindings.__name__ == "mujoco_py": + self.viewer = self._mujoco_bindings.MjRenderContextOffscreen( + self.sim, -1 + ) + else: + from gym.envs.mujoco.mujoco_rendering import RenderContextOffscreen + + self.viewer = RenderContextOffscreen( + width, height, self.model, self.data + ) self.viewer_setup() self._viewers[mode] = self.viewer return self.viewer def get_body_com(self, body_name): - return self.data.get_body_xpos(body_name) + if self._mujoco_bindings.__name__ == "mujoco_py": + return self.data.get_body_xpos(body_name) + else: + return self.data.body(body_name).xpos def state_vector(self): - return np.concatenate([self.sim.data.qpos.flat, self.sim.data.qvel.flat]) + return np.concatenate([self.data.qpos.flat, self.data.qvel.flat]) diff --git a/gym/envs/mujoco/mujoco_rendering.py b/gym/envs/mujoco/mujoco_rendering.py new file mode 100644 index 00000000000..c49bf0b1760 --- /dev/null +++ b/gym/envs/mujoco/mujoco_rendering.py @@ -0,0 +1,554 @@ +import collections +import os +import sys +import time +from threading import Lock + +import glfw +import imageio +import mujoco +import numpy as np + + +def _import_egl(width, height): + from mujoco.egl import GLContext + + return GLContext(width, height) + + +def _import_glfw(width, height): + from mujoco.glfw import GLContext + + return GLContext(width, height) + + +def _import_osmesa(width, height): + from mujoco.osmesa import GLContext + + return GLContext(width, height) + + +_ALL_RENDERERS = collections.OrderedDict( + [ + ("glfw", _import_glfw), + ("egl", _import_egl), + ("osmesa", _import_osmesa), + ] +) + + +class RenderContext: + """Render context superclass for offscreen and window rendering.""" + + def __init__(self, model, data, offscreen=True): + + self.model = model + self.data = data + self.offscreen = offscreen + max_geom = 1000 + + mujoco.mj_forward(self.model, self.data) + + self.scn = mujoco.MjvScene(self.model, max_geom) + self.cam = mujoco.MjvCamera() + self.vopt = mujoco.MjvOption() + self.pert = mujoco.MjvPerturb() + self.con = mujoco.MjrContext(self.model, mujoco.mjtFontScale.mjFONTSCALE_150) + + self._markers = [] + self._overlays = {} + + self._init_camera() + self._set_mujoco_buffers() + + def _set_mujoco_buffers(self): + if self.offscreen: + mujoco.mjr_setBuffer(mujoco.mjtFramebuffer.mjFB_OFFSCREEN, self.con) + if self.con.currentBuffer != mujoco.mjtFramebuffer.mjFB_OFFSCREEN: + raise RuntimeError("Offscreen rendering not supported") + else: + mujoco.mjr_setBuffer(mujoco.mjtFramebuffer.mjFB_WINDOW, self.con) + if self.con.currentBuffer != mujoco.mjtFramebuffer.mjFB_WINDOW: + raise RuntimeError("Window rendering not supported") + + def update_offscreen_size(self, width, height): + if width != self.con.offWidth or height != self.con.offHeight: + self.model.vis.global_.offwidth = width + self.model.vis.global_.offheight = height + self.con.free() + self._set_mujoco_buffers() + + def render(self, width, height, camera_id=None, segmentation=False): + rect = mujoco.MjrRect(left=0, bottom=0, width=width, height=height) + + # Sometimes buffers are too small. + if width > self.con.offWidth or height > self.con.offHeight: + new_width = max(width, self.model.vis.global_.offwidth) + new_height = max(height, self.model.vis.global_.offheight) + self.update_offscreen_size(new_width, new_height) + + if camera_id is not None: + if camera_id == -1: + self.cam.type = mujoco.mjtCamera.mjCAMERA_FREE + else: + self.cam.type = mujoco.mjtCamera.mjCAMERA_FIXED + self.cam.fixedcamid = camera_id + + mujoco.mjv_updateScene( + self.model, + self.data, + self.vopt, + self.pert, + self.cam, + mujoco.mjtCatBit.mjCAT_ALL, + self.scn, + ) + + if segmentation: + self.scn.flags[mujoco.mjtRndFlag.mjRND_SEGMENT] = 1 + self.scn.flags[mujoco.mjtRndFlag.mjRND_IDCOLOR] = 1 + + for marker_params in self._markers: + self._add_marker_to_scene(marker_params) + + mujoco.mjr_render(rect, self.scn, self.con) + + for gridpos, (text1, text2) in self._overlays.items(): + mujoco.mjr_overlay( + mujoco.mjtFontScale.mjFONTSCALE_150, + gridpos, + rect, + text1.encode(), + text2.encode(), + self.con, + ) + + if segmentation: + self.scn.flags[mujoco.mjtRndFlag.mjRND_SEGMENT] = 0 + self.scn.flags[mujoco.mjtRndFlag.mjRND_IDCOLOR] = 0 + + def read_pixels(self, width, height, depth=True, segmentation=False): + rect = mujoco.MjrRect(left=0, bottom=0, width=width, height=height) + + rgb_arr = np.zeros(3 * rect.width * rect.height, dtype=np.uint8) + depth_arr = np.zeros(rect.width * rect.height, dtype=np.float32) + + mujoco.mjr_readPixels(rgb_arr, depth_arr, rect, self.con) + rgb_img = rgb_arr.reshape(rect.height, rect.width, 3) + + ret_img = rgb_img + if segmentation: + seg_img = ( + rgb_img[:, :, 0] + + rgb_img[:, :, 1] * (2**8) + + rgb_img[:, :, 2] * (2**16) + ) + seg_img[seg_img >= (self.scn.ngeom + 1)] = 0 + seg_ids = np.full((self.scn.ngeom + 1, 2), fill_value=-1, dtype=np.int32) + + for i in range(self.scn.ngeom): + geom = self.scn.geoms[i] + if geom.segid != -1: + seg_ids[geom.segid + 1, 0] = geom.objtype + seg_ids[geom.segid + 1, 1] = geom.objid + ret_img = seg_ids[seg_img] + + if depth: + depth_img = depth_arr.reshape(rect.height, rect.width) + return (ret_img, depth_img) + else: + return ret_img + + def _init_camera(self): + self.cam.type = mujoco.mjtCamera.mjCAMERA_FREE + self.cam.fixedcamid = -1 + for i in range(3): + self.cam.lookat[i] = np.median(self.data.geom_xpos[:, i]) + self.cam.distance = self.model.stat.extent + + def add_overlay(self, gridpos: int, text1: str, text2: str): + """Overlays text on the scene.""" + if gridpos not in self._overlays: + self._overlays[gridpos] = ["", ""] + self._overlays[gridpos][0] += text1 + "\n" + self._overlays[gridpos][1] += text2 + "\n" + + def add_marker(self, **marker_params): + self._markers.append(marker_params) + + def _add_marker_to_scene(self, marker): + if self.scn.ngeom >= self.scn.maxgeom: + raise RuntimeError("Ran out of geoms. maxgeom: %d" % self.scn.maxgeom) + + g = self.scn.geoms[self.scn.ngeom] + # default values. + g.dataid = -1 + g.objtype = mujoco.mjtObj.mjOBJ_UNKNOWN + g.objid = -1 + g.category = mujoco.mjtCatBit.mjCAT_DECOR + g.texid = -1 + g.texuniform = 0 + g.texrepeat[0] = 1 + g.texrepeat[1] = 1 + g.emission = 0 + g.specular = 0.5 + g.shininess = 0.5 + g.reflectance = 0 + g.type = mujoco.mjtGeom.mjGEOM_BOX + g.size[:] = np.ones(3) * 0.1 + g.mat[:] = np.eye(3) + g.rgba[:] = np.ones(4) + + for key, value in marker.items(): + if isinstance(value, (int, float, mujoco._enums.mjtGeom)): + setattr(g, key, value) + elif isinstance(value, (tuple, list, np.ndarray)): + attr = getattr(g, key) + attr[:] = np.asarray(value).reshape(attr.shape) + elif isinstance(value, str): + assert key == "label", "Only label is a string in mjtGeom." + if value is None: + g.label[0] = 0 + else: + g.label = value + elif hasattr(g, key): + raise ValueError( + "mjtGeom has attr {} but type {} is invalid".format( + key, type(value) + ) + ) + else: + raise ValueError("mjtGeom doesn't have field %s" % key) + + self.scn.ngeom += 1 + + +class RenderContextOffscreen(RenderContext): + """Offscreen rendering class with opengl context.""" + + def __init__(self, width, height, model, data): + + self._get_opengl_backend(width, height) + self.opengl_context.make_current() + + super().__init__(model, data, offscreen=True) + + def _get_opengl_backend(self, width, height): + + backend = os.environ.get("MUJOCO_GL") + if backend is not None: + try: + self.opengl_context = _ALL_RENDERERS[backend](width, height) + except KeyError: + raise RuntimeError( + "Environment variable {} must be one of {!r}: got {!r}.".format( + "MUJOCO_GL", _ALL_RENDERERS.keys(), backend + ) + ) + + else: + for name, _ in _ALL_RENDERERS.items(): + try: + self.opengl_context = _ALL_RENDERERS[name](width, height) + backend = name + break + except: # noqa:E722 + pass + if backend is None: + raise RuntimeError( + "No OpenGL backend could be imported. Attempting to create a " + "rendering context will result in a RuntimeError." + ) + + +class Viewer(RenderContext): + """Class for window rendering in all MuJoCo environments.""" + + def __init__(self, model, data): + self._gui_lock = Lock() + self._button_left_pressed = False + self._button_right_pressed = False + self._last_mouse_x = 0 + self._last_mouse_y = 0 + self._paused = False + self._transparent = False + self._contacts = False + self._render_every_frame = True + self._image_idx = 0 + self._image_path = "/tmp/frame_%07d.png" + self._time_per_render = 1 / 60.0 + self._run_speed = 1.0 + self._loop_count = 0 + self._advance_by_one_step = False + self._hide_menu = False + + # glfw init + glfw.init() + width, height = glfw.get_video_mode(glfw.get_primary_monitor()).size + self.window = glfw.create_window(width // 2, height // 2, "mujoco", None, None) + glfw.make_context_current(self.window) + glfw.swap_interval(1) + + framebuffer_width, framebuffer_height = glfw.get_framebuffer_size(self.window) + window_width, _ = glfw.get_window_size(self.window) + self._scale = framebuffer_width * 1.0 / window_width + + # set callbacks + glfw.set_cursor_pos_callback(self.window, self._cursor_pos_callback) + glfw.set_mouse_button_callback(self.window, self._mouse_button_callback) + glfw.set_scroll_callback(self.window, self._scroll_callback) + glfw.set_key_callback(self.window, self._key_callback) + + # get viewport + self.viewport = mujoco.MjrRect(0, 0, framebuffer_width, framebuffer_height) + + super().__init__(model, data, offscreen=False) + + def _key_callback(self, key, action): + if action != glfw.RELEASE: + return + # Switch cameras + elif key == glfw.KEY_TAB: + self.cam.fixedcamid += 1 + self.cam.type = mujoco.mjtCamera.mjCAMERA_FIXED + if self.cam.fixedcamid >= self.model.ncam: + self.cam.fixedcamid = -1 + self.cam.type = mujoco.mjtCamera.mjCAMERA_FREE + # Pause simulation + elif key == glfw.KEY_SPACE and self._paused is not None: + self._paused = not self._paused + # Advances simulation by one step. + elif key == glfw.KEY_RIGHT and self._paused is not None: + self._advance_by_one_step = True + self._paused = True + # Slows down simulation + elif key == glfw.KEY_S: + self._run_speed /= 2.0 + # Speeds up simulation + elif key == glfw.KEY_F: + self._run_speed *= 2.0 + # Turn off / turn on rendering every frame. + elif key == glfw.KEY_D: + self._render_every_frame = not self._render_every_frame + # Capture screenshot + elif key == glfw.KEY_T: + img = np.zeros( + ( + glfw.get_framebuffer_size(self.window)[1], + glfw.get_framebuffer_size(self.window)[0], + 3, + ), + dtype=np.uint8, + ) + mujoco.mjr_readPixels(img, None, self.viewport, self.con) + imageio.imwrite(self._image_path % self._image_idx, np.flipud(img)) + self._image_idx += 1 + # Display contact forces + elif key == glfw.KEY_C: + self._contacts = not self._contacts + self.vopt.flags[mujoco.mjtVisFlag.mjVIS_CONTACTPOINT] = self._contacts + self.vopt.flags[mujoco.mjtVisFlag.mjVIS_CONTACTFORCE] = self._contacts + # Display coordinate frames + elif key == glfw.KEY_E: + self.vopt.frame = 1 - self.vopt.frame + # Hide overlay menu + elif key == glfw.KEY_H: + self._hide_menu = not self._hide_menu + # Make transparent + elif key == glfw.KEY_R: + self._transparent = not self._transparent + if self._transparent: + self.model.geom_rgba[:, 3] /= 5.0 + else: + self.model.geom_rgba[:, 3] *= 5.0 + # Geom group visibility + elif key in (glfw.KEY_0, glfw.KEY_1, glfw.KEY_2, glfw.KEY_3, glfw.KEY_4): + self.vopt.geomgroup[key - glfw.KEY_0] ^= 1 + # Quit + if key == glfw.KEY_ESCAPE: + print("Pressed ESC") + print("Quitting.") + glfw.terminate() + sys.exit(0) + + def _cursor_pos_callback(self, window, xpos, ypos): + if not (self._button_left_pressed or self._button_right_pressed): + return + + mod_shift = ( + glfw.get_key(window, glfw.KEY_LEFT_SHIFT) == glfw.PRESS + or glfw.get_key(window, glfw.KEY_RIGHT_SHIFT) == glfw.PRESS + ) + if self._button_right_pressed: + action = ( + mujoco.mjtMouse.mjMOUSE_MOVE_H + if mod_shift + else mujoco.mjtMouse.mjMOUSE_MOVE_V + ) + elif self._button_left_pressed: + action = ( + mujoco.mjtMouse.mjMOUSE_ROTATE_H + if mod_shift + else mujoco.mjtMouse.mjMOUSE_ROTATE_V + ) + else: + action = mujoco.mjtMouse.mjMOUSE_ZOOM + + dx = int(self._scale * xpos) - self._last_mouse_x + dy = int(self._scale * ypos) - self._last_mouse_y + width, height = glfw.get_framebuffer_size(window) + + with self._gui_lock: + mujoco.mjv_moveCamera( + self.model, action, dx / height, dy / height, self.scn, self.cam + ) + + self._last_mouse_x = int(self._scale * xpos) + self._last_mouse_y = int(self._scale * ypos) + + def _mouse_button_callback(self, window, button, act, mods): + self._button_left_pressed = ( + glfw.get_mouse_button(window, glfw.MOUSE_BUTTON_LEFT) == glfw.PRESS + ) + self._button_right_pressed = ( + glfw.get_mouse_button(window, glfw.MOUSE_BUTTON_RIGHT) == glfw.PRESS + ) + + x, y = glfw.get_cursor_pos(window) + self._last_mouse_x = int(self._scale * x) + self._last_mouse_y = int(self._scale * y) + + def _scroll_callback(self, window, x_offset, y_offset): + with self._gui_lock: + mujoco.mjv_moveCamera( + self.model, + mujoco.mjtMouse.mjMOUSE_ZOOM, + 0, + -0.05 * y_offset, + self.scn, + self.cam, + ) + + def _create_overlay(self): + topleft = mujoco.mjtGridPos.mjGRID_TOPLEFT + bottomleft = mujoco.mjtGridPos.mjGRID_BOTTOMLEFT + + if self._render_every_frame: + self.add_overlay(topleft, "", "") + else: + self.add_overlay( + topleft, + "Run speed = %.3f x real time" % self._run_speed, + "[S]lower, [F]aster", + ) + self.add_overlay( + topleft, "Ren[d]er every frame", "On" if self._render_every_frame else "Off" + ) + self.add_overlay( + topleft, + "Switch camera (#cams = %d)" % (self.model.ncam + 1), + "[Tab] (camera ID = %d)" % self.cam.fixedcamid, + ) + self.add_overlay(topleft, "[C]ontact forces", "On" if self._contacts else "Off") + self.add_overlay(topleft, "T[r]ansparent", "On" if self._transparent else "Off") + if self._paused is not None: + if not self._paused: + self.add_overlay(topleft, "Stop", "[Space]") + else: + self.add_overlay(topleft, "Start", "[Space]") + self.add_overlay( + topleft, "Advance simulation by one step", "[right arrow]" + ) + self.add_overlay( + topleft, "Referenc[e] frames", "On" if self.vopt.frame == 1 else "Off" + ) + self.add_overlay(topleft, "[H]ide Menu", "") + if self._image_idx > 0: + fname = self._image_path % (self._image_idx - 1) + self.add_overlay(topleft, "Cap[t]ure frame", "Saved as %s" % fname) + else: + self.add_overlay(topleft, "Cap[t]ure frame", "") + self.add_overlay(topleft, "Toggle geomgroup visibility", "0-4") + + self.add_overlay(bottomleft, "FPS", "%d%s" % (1 / self._time_per_render, "")) + self.add_overlay( + bottomleft, "Solver iterations", str(self.data.solver_iter + 1) + ) + self.add_overlay( + bottomleft, "Step", str(round(self.data.time / self.model.opt.timestep)) + ) + self.add_overlay(bottomleft, "timestep", "%.5f" % self.model.opt.timestep) + + def render(self): + # mjv_updateScene, mjr_render, mjr_overlay + def update(): + # fill overlay items + self._create_overlay() + + render_start = time.time() + if self.window is None: + return + elif glfw.window_should_close(self.window): + glfw.terminate() + sys.exit(0) + self.viewport.width, self.viewport.height = glfw.get_framebuffer_size( + self.window + ) + with self._gui_lock: + # update scene + mujoco.mjv_updateScene( + self.model, + self.data, + self.vopt, + mujoco.MjvPerturb(), + self.cam, + mujoco.mjtCatBit.mjCAT_ALL.value, + self.scn, + ) + # marker items + for marker in self._markers: + self._add_marker_to_scene(marker) + # render + mujoco.mjr_render(self.viewport, self.scn, self.con) + # overlay items + if not self._hide_menu: + for gridpos, [t1, t2] in self._overlays.items(): + mujoco.mjr_overlay( + mujoco.mjtFontScale.mjFONTSCALE_150, + gridpos, + self.viewport, + t1, + t2, + self.con, + ) + glfw.swap_buffers(self.window) + glfw.poll_events() + self._time_per_render = 0.9 * self._time_per_render + 0.1 * ( + time.time() - render_start + ) + + # clear overlay + self._overlays.clear() + + if self._paused: + while self._paused: + update() + if self._advance_by_one_step: + self._advance_by_one_step = False + break + else: + self._loop_count += self.model.opt.timestep / ( + self._time_per_render * self._run_speed + ) + if self._render_every_frame: + self._loop_count = 1 + while self._loop_count > 0: + update() + self._loop_count -= 1 + + # clear markers + self._markers[:] = [] + + def close(self): + glfw.terminate() + sys.exit(0) diff --git a/gym/envs/mujoco/pusher.py b/gym/envs/mujoco/pusher.py index 6e20ba42b64..ee6da42be76 100644 --- a/gym/envs/mujoco/pusher.py +++ b/gym/envs/mujoco/pusher.py @@ -5,131 +5,11 @@ class PusherEnv(mujoco_env.MujocoEnv, utils.EzPickle): - """ - ### Description - "Pusher" is a multi-jointed robot arm which is very similar to that of a human. - The goal is to move a target cylinder (called *object*) to a goal position using the robot's end effector (called *fingertip*). - The robot consists of shoulder, elbow, forearm, and wrist joints. - - ### Action Space - The action space is a `Box(-2, 2, (7,), float32)`. An action `(a, b)` represents the torques applied at the hinge joints. - - | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit | - |-----|--------------------------------------------------------------------|-------------|-------------|----------------------------------|-------|--------------| - | 0 | Rotation of the panning the shoulder | -2 | 2 | r_shoulder_pan_joint | hinge | torque (N m) | - | 1 | Rotation of the shoulder lifting joint | -2 | 2 | r_shoulder_lift_joint | hinge | torque (N m) | - | 2 | Rotation of the shoulder rolling joint | -2 | 2 | r_upper_arm_roll_joint | hinge | torque (N m) | - | 3 | Rotation of hinge joint that flexed the elbow | -2 | 2 | r_elbow_flex_joint | hinge | torque (N m) | - | 4 | Rotation of hinge that rolls the forearm | -2 | 2 | r_forearm_roll_joint | hinge | torque (N m) | - | 5 | Rotation of flexing the wrist | -2 | 2 | r_wrist_flex_joint | hinge | torque (N m) | - | 6 | Rotation of rolling the wrist | -2 | 2 | r_wrist_roll_joint | hinge | torque (N m) | - - ### Observation Space - - Observations consist of - - - Angle of rotational joints on the pusher - - Anglular velocities of rotational joints on the pusher - - The coordinates of the fingertip of the pusher - - The coordinates of the object to be moved - - The coordinates of the goal position - - The observation is a `ndarray` with shape `(23,)` where the elements correspond to the table below. - An analogy can be drawn to a human arm in order to help understand the state space, with the words flex and roll meaning the - same as human joints. - - | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint| Unit | - |-----|-----------------------|----------------------|--------------------|----------------------|--------------------|--------------------| - | 0 | Rotation of the panning the shoulder | -Inf | Inf | r_shoulder_pan_joint | hinge | angle (rad) | - | 1 | Rotation of the shoulder lifting joint | -Inf | Inf | r_shoulder_lift_joint | hinge | angle (rad) | - | 2 | Rotation of the shoulder rolling joint | -Inf | Inf | r_upper_arm_roll_joint | hinge | angle (rad) | - | 3 | Rotation of hinge joint that flexed the elbow | -Inf | Inf | r_elbow_flex_joint | hinge | angle (rad) | - | 4 | Rotation of hinge that rolls the forearm | -Inf | Inf | r_forearm_roll_joint | hinge | angle (rad) | - | 5 | Rotation of flexing the wrist | -Inf | Inf | r_wrist_flex_joint | hinge | angle (rad) | - | 6 | Rotation of rolling the wrist | -Inf | Inf | r_wrist_roll_joint | hinge | angle (rad) | - | 7 | Rotational velocity of the panning the shoulder | -Inf | Inf | r_shoulder_pan_joint | hinge | angular velocity (rad/s) | - | 8 | Rotational velocity of the shoulder lifting joint | -Inf | Inf | r_shoulder_lift_joint | hinge | angular velocity (rad/s) | - | 9 | Rotational velocity of the shoulder rolling joint | -Inf | Inf | r_upper_arm_roll_joint | hinge | angular velocity (rad/s) | - | 10 | Rotational velocity of hinge joint that flexed the elbow | -Inf | Inf | r_elbow_flex_joint | hinge | angular velocity (rad/s) | - | 11 | Rotational velocity of hinge that rolls the forearm | -Inf | Inf | r_forearm_roll_joint | hinge | angular velocity (rad/s) | - | 12 | Rotational velocity of flexing the wrist | -Inf | Inf | r_wrist_flex_joint | hinge | angular velocity (rad/s) | - | 13 | Rotational velocity of rolling the wrist | -Inf | Inf | r_wrist_roll_joint | hinge | angular velocity (rad/s) | - | 14 | x-coordinate of the fingertip of the pusher | -Inf | Inf | tips_arm | slide | position (m) | - | 15 | y-coordinate of the fingertip of the pusher | -Inf | Inf | tips_arm | slide | position (m) | - | 16 | z-coordinate of the fingertip of the pusher | -Inf | Inf | tips_arm | slide | position (m) | - | 17 | x-coordinate of the object to be moved | -Inf | Inf | object (obj_slidex) | slide | position (m) | - | 18 | y-coordinate of the object to be moved | -Inf | Inf | object (obj_slidey) | slide | position (m) | - | 19 | z-coordinate of the object to be moved | -Inf | Inf | object | cylinder | position (m) | - | 20 | x-coordinate of the goal position of the object | -Inf | Inf | goal (goal_slidex) | slide | position (m) | - | 21 | y-coordinate of the goal position of the object | -Inf | Inf | goal (goal_slidey) | slide | position (m) | - | 22 | z-coordinate of the goal position of the object | -Inf | Inf | goal | sphere | position (m) | - - - ### Rewards - The reward consists of two parts: - - *reward_near *: This reward is a measure of how far the *fingertip* - of the pusher (the unattached end) is from the object, with a more negative - value assigned for when the pusher's *fingertip* is further away from the - target. It is calculated as the negative vector norm of (position of - the fingertip - position of target), or *-norm("fingertip" - "target")*. - - *reward_dist *: This reward is a measure of how far the object is from - the target goal position, with a more negative value assigned for object is - further away from the target. It is calculated as the negative vector norm of - (position of the object - position of goal), or *-norm("object" - "target")*. - - *reward_control*: A negative reward for penalising the pusher if - it takes actions that are too large. It is measured as the negative squared - Euclidean norm of the action, i.e. as *- sum(action2)*. - - The total reward returned is ***reward*** *=* *reward_dist + 0.1 * reward_ctrl + 0.5 * reward_near* - - Unlike other environments, Pusher does not allow you to specify weights for the individual reward terms. - However, `info` does contain the keys *reward_dist* and *reward_ctrl*. Thus, if you'd like to weight the terms, - you should create a wrapper that computes the weighted reward from `info`. - - - ### Starting State - All pusher (not including object and goal) states start in - (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0). A uniform noise in the range - [-0.005, 0.005] is added to the velocity attributes only. The velocities of - the object and goal are permanently set to 0. The object's x-position is selected uniformly - between [-0.3, 0] while the y-position is selected uniformly between [-0.2, 0.2], and this - process is repeated until the vector norm between the object's (x,y) position and origin is not greater - than 0.17. The goal always have the same position of (0.45, -0.05, -0.323). - - The default framerate is 5 with each frame lasting for 0.01, giving rise to a *dt = 5 * 0.01 = 0.05* - - ### Episode Termination - - The episode terminates when any of the following happens: - - 1. The episode duration reaches a 100 timesteps. - 2. Any of the state space values is no longer finite. - - ### Arguments - - No additional arguments are currently supported (in v2 and lower), - but modifications can be made to the XML file in the assets folder - (or by changing the path to a modified XML file in another folder).. - - ``` - env = gym.make('Pusher-v2') - ``` - - There is no v3 for Pusher, unlike the robot environments where a v3 and - beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. - - - ### Version History - - * v2: All continuous control environments now use mujoco_py >= 1.50 - * v1: max_time_steps raised to 1000 for robot based tasks (not including reacher, which has a max_time_steps of 50). Added reward_threshold to environments. - * v0: Initial versions release (1.0.0) - - """ - def __init__(self): utils.EzPickle.__init__(self) - mujoco_env.MujocoEnv.__init__(self, "pusher.xml", 5) + mujoco_env.MujocoEnv.__init__( + self, "pusher.xml", 5, mujoco_bindings="mujoco_py" + ) def step(self, a): vec_1 = self.get_body_com("object") - self.get_body_com("tips_arm") diff --git a/gym/envs/mujoco/pusher_v4.py b/gym/envs/mujoco/pusher_v4.py new file mode 100644 index 00000000000..afb4b2efb82 --- /dev/null +++ b/gym/envs/mujoco/pusher_v4.py @@ -0,0 +1,185 @@ +import numpy as np + +from gym import utils +from gym.envs.mujoco import mujoco_env + + +class PusherEnv(mujoco_env.MujocoEnv, utils.EzPickle): + """ + ### Description + "Pusher" is a multi-jointed robot arm which is very similar to that of a human. + The goal is to move a target cylinder (called *object*) to a goal position using the robot's end effector (called *fingertip*). + The robot consists of shoulder, elbow, forearm, and wrist joints. + + ### Action Space + The action space is a `Box(-2, 2, (7,), float32)`. An action `(a, b)` represents the torques applied at the hinge joints. + + | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit | + |-----|--------------------------------------------------------------------|-------------|-------------|----------------------------------|-------|--------------| + | 0 | Rotation of the panning the shoulder | -2 | 2 | r_shoulder_pan_joint | hinge | torque (N m) | + | 1 | Rotation of the shoulder lifting joint | -2 | 2 | r_shoulder_lift_joint | hinge | torque (N m) | + | 2 | Rotation of the shoulder rolling joint | -2 | 2 | r_upper_arm_roll_joint | hinge | torque (N m) | + | 3 | Rotation of hinge joint that flexed the elbow | -2 | 2 | r_elbow_flex_joint | hinge | torque (N m) | + | 4 | Rotation of hinge that rolls the forearm | -2 | 2 | r_forearm_roll_joint | hinge | torque (N m) | + | 5 | Rotation of flexing the wrist | -2 | 2 | r_wrist_flex_joint | hinge | torque (N m) | + | 6 | Rotation of rolling the wrist | -2 | 2 | r_wrist_roll_joint | hinge | torque (N m) | + + ### Observation Space + + Observations consist of + + - Angle of rotational joints on the pusher + - Anglular velocities of rotational joints on the pusher + - The coordinates of the fingertip of the pusher + - The coordinates of the object to be moved + - The coordinates of the goal position + + The observation is a `ndarray` with shape `(23,)` where the elements correspond to the table below. + An analogy can be drawn to a human arm in order to help understand the state space, with the words flex and roll meaning the + same as human joints. + + | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint| Unit | + |-----|-----------------------|----------------------|--------------------|----------------------|--------------------|--------------------| + | 0 | Rotation of the panning the shoulder | -Inf | Inf | r_shoulder_pan_joint | hinge | angle (rad) | + | 1 | Rotation of the shoulder lifting joint | -Inf | Inf | r_shoulder_lift_joint | hinge | angle (rad) | + | 2 | Rotation of the shoulder rolling joint | -Inf | Inf | r_upper_arm_roll_joint | hinge | angle (rad) | + | 3 | Rotation of hinge joint that flexed the elbow | -Inf | Inf | r_elbow_flex_joint | hinge | angle (rad) | + | 4 | Rotation of hinge that rolls the forearm | -Inf | Inf | r_forearm_roll_joint | hinge | angle (rad) | + | 5 | Rotation of flexing the wrist | -Inf | Inf | r_wrist_flex_joint | hinge | angle (rad) | + | 6 | Rotation of rolling the wrist | -Inf | Inf | r_wrist_roll_joint | hinge | angle (rad) | + | 7 | Rotational velocity of the panning the shoulder | -Inf | Inf | r_shoulder_pan_joint | hinge | angular velocity (rad/s) | + | 8 | Rotational velocity of the shoulder lifting joint | -Inf | Inf | r_shoulder_lift_joint | hinge | angular velocity (rad/s) | + | 9 | Rotational velocity of the shoulder rolling joint | -Inf | Inf | r_upper_arm_roll_joint | hinge | angular velocity (rad/s) | + | 10 | Rotational velocity of hinge joint that flexed the elbow | -Inf | Inf | r_elbow_flex_joint | hinge | angular velocity (rad/s) | + | 11 | Rotational velocity of hinge that rolls the forearm | -Inf | Inf | r_forearm_roll_joint | hinge | angular velocity (rad/s) | + | 12 | Rotational velocity of flexing the wrist | -Inf | Inf | r_wrist_flex_joint | hinge | angular velocity (rad/s) | + | 13 | Rotational velocity of rolling the wrist | -Inf | Inf | r_wrist_roll_joint | hinge | angular velocity (rad/s) | + | 14 | x-coordinate of the fingertip of the pusher | -Inf | Inf | tips_arm | slide | position (m) | + | 15 | y-coordinate of the fingertip of the pusher | -Inf | Inf | tips_arm | slide | position (m) | + | 16 | z-coordinate of the fingertip of the pusher | -Inf | Inf | tips_arm | slide | position (m) | + | 17 | x-coordinate of the object to be moved | -Inf | Inf | object (obj_slidex) | slide | position (m) | + | 18 | y-coordinate of the object to be moved | -Inf | Inf | object (obj_slidey) | slide | position (m) | + | 19 | z-coordinate of the object to be moved | -Inf | Inf | object | cylinder | position (m) | + | 20 | x-coordinate of the goal position of the object | -Inf | Inf | goal (goal_slidex) | slide | position (m) | + | 21 | y-coordinate of the goal position of the object | -Inf | Inf | goal (goal_slidey) | slide | position (m) | + | 22 | z-coordinate of the goal position of the object | -Inf | Inf | goal | sphere | position (m) | + + + ### Rewards + The reward consists of two parts: + - *reward_near *: This reward is a measure of how far the *fingertip* + of the pusher (the unattached end) is from the object, with a more negative + value assigned for when the pusher's *fingertip* is further away from the + target. It is calculated as the negative vector norm of (position of + the fingertip - position of target), or *-norm("fingertip" - "target")*. + - *reward_dist *: This reward is a measure of how far the object is from + the target goal position, with a more negative value assigned for object is + further away from the target. It is calculated as the negative vector norm of + (position of the object - position of goal), or *-norm("object" - "target")*. + - *reward_control*: A negative reward for penalising the pusher if + it takes actions that are too large. It is measured as the negative squared + Euclidean norm of the action, i.e. as *- sum(action2)*. + + The total reward returned is ***reward*** *=* *reward_dist + 0.1 * reward_ctrl + 0.5 * reward_near* + + Unlike other environments, Pusher does not allow you to specify weights for the individual reward terms. + However, `info` does contain the keys *reward_dist* and *reward_ctrl*. Thus, if you'd like to weight the terms, + you should create a wrapper that computes the weighted reward from `info`. + + + ### Starting State + All pusher (not including object and goal) states start in + (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0). A uniform noise in the range + [-0.005, 0.005] is added to the velocity attributes only. The velocities of + the object and goal are permanently set to 0. The object's x-position is selected uniformly + between [-0.3, 0] while the y-position is selected uniformly between [-0.2, 0.2], and this + process is repeated until the vector norm between the object's (x,y) position and origin is not greater + than 0.17. The goal always have the same position of (0.45, -0.05, -0.323). + + The default framerate is 5 with each frame lasting for 0.01, giving rise to a *dt = 5 * 0.01 = 0.05* + + ### Episode Termination + + The episode terminates when any of the following happens: + + 1. The episode duration reaches a 100 timesteps. + 2. Any of the state space values is no longer finite. + + ### Arguments + + No additional arguments are currently supported (in v2 and lower), + but modifications can be made to the XML file in the assets folder + (or by changing the path to a modified XML file in another folder).. + + ``` + env = gym.make('Pusher-v2') + ``` + + There is no v3 for Pusher, unlike the robot environments where a v3 and + beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. + + + ### Version History + + * v4: all mujoco environments now use the mujoco binidings in mujoco>=2.1.3 + * v2: All continuous control environments now use mujoco_py >= 1.50 + * v1: max_time_steps raised to 1000 for robot based tasks (not including reacher, which has a max_time_steps of 50). Added reward_threshold to environments. + * v0: Initial versions release (1.0.0) + + """ + + def __init__(self): + utils.EzPickle.__init__(self) + mujoco_env.MujocoEnv.__init__(self, "pusher.xml", 5) + + def step(self, a): + vec_1 = self.get_body_com("object") - self.get_body_com("tips_arm") + vec_2 = self.get_body_com("object") - self.get_body_com("goal") + + reward_near = -np.linalg.norm(vec_1) + reward_dist = -np.linalg.norm(vec_2) + reward_ctrl = -np.square(a).sum() + reward = reward_dist + 0.1 * reward_ctrl + 0.5 * reward_near + + self.do_simulation(a, self.frame_skip) + ob = self._get_obs() + done = False + return ob, reward, done, dict(reward_dist=reward_dist, reward_ctrl=reward_ctrl) + + def viewer_setup(self): + self.viewer.cam.trackbodyid = -1 + self.viewer.cam.distance = 4.0 + + def reset_model(self): + qpos = self.init_qpos + + self.goal_pos = np.asarray([0, 0]) + while True: + self.cylinder_pos = np.concatenate( + [ + self.np_random.uniform(low=-0.3, high=0, size=1), + self.np_random.uniform(low=-0.2, high=0.2, size=1), + ] + ) + if np.linalg.norm(self.cylinder_pos - self.goal_pos) > 0.17: + break + + qpos[-4:-2] = self.cylinder_pos + qpos[-2:] = self.goal_pos + qvel = self.init_qvel + self.np_random.uniform( + low=-0.005, high=0.005, size=self.model.nv + ) + qvel[-4:] = 0 + self.set_state(qpos, qvel) + return self._get_obs() + + def _get_obs(self): + return np.concatenate( + [ + self.data.qpos.flat[:7], + self.data.qvel.flat[:7], + self.get_body_com("tips_arm"), + self.get_body_com("object"), + self.get_body_com("goal"), + ] + ) diff --git a/gym/envs/mujoco/reacher.py b/gym/envs/mujoco/reacher.py index 0df3a974bd3..5fd1834bcc5 100644 --- a/gym/envs/mujoco/reacher.py +++ b/gym/envs/mujoco/reacher.py @@ -5,121 +5,11 @@ class ReacherEnv(mujoco_env.MujocoEnv, utils.EzPickle): - """ - ### Description - "Reacher" is a two-jointed robot arm. The goal is to move the robot's end effector (called *fingertip*) close to a - target that is spawned at a random position. - - ### Action Space - The action space is a `Box(-1, 1, (2,), float32)`. An action `(a, b)` represents the torques applied at the hinge joints. - - | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit | - |-----|---------------------------------------------------------------------------------|-------------|-------------|--------------------------|-------|------| - | 0 | Torque applied at the first hinge (connecting the link to the point of fixture) | -1 | 1 | joint0 | hinge | torque (N m) | - | 1 | Torque applied at the second hinge (connecting the two links) | -1 | 1 | joint1 | hinge | torque (N m) | - - ### Observation Space - - Observations consist of - - - The cosine of the angles of the two arms - - The sine of the angles of the two arms - - The coordinates of the target - - The angular velocities of the arms - - The vector between the target and the reacher's fingertip (3 dimensional with the last element being 0) - - The observation is a `ndarray` with shape `(11,)` where the elements correspond to the following: - - | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint| Unit | - |-----|-----------------------|----------------------|--------------------|----------------------|--------------------|--------------------| - | 0 | cosine of the angle of the first arm | -Inf | Inf | cos(joint0) | hinge | unitless | - | 1 | cosine of the angle of the second arm | -Inf | Inf | cos(joint1) | hinge | unitless | - | 2 | sine of the angle of the first arm | -Inf | Inf | cos(joint0) | hinge | unitless | - | 3 | sine of the angle of the second arm | -Inf | Inf | cos(joint1) | hinge | unitless | - | 4 | x-coorddinate of the target | -Inf | Inf | target_x | slide | position (m) | - | 5 | y-coorddinate of the target | -Inf | Inf | target_y | slide | position (m) | - | 6 | angular velocity of the first arm | -Inf | Inf | joint0 | hinge | angular velocity (rad/s) | - | 7 | angular velocity of the second arm | -Inf | Inf | joint1 | hinge | angular velocity (rad/s) | - | 8 | x-value of position_fingertip - position_target | -Inf | Inf | NA | slide | position (m) | - | 9 | y-value of position_fingertip - position_target | -Inf | Inf | NA | slide | position (m) | - | 10 | z-value of position_fingertip - position_target (0 since reacher is 2d and z is same for both) | -Inf | Inf | NA | slide | position (m) | - - - Most Gym environments just return the positions and velocity of the - joints in the `.xml` file as the state of the environment. However, in - reacher the state is created by combining only certain elements of the - position and velocity, and performing some function transformations on them. - If one is to read the `.xml` for reacher then they will find 4 joints: - - | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit | - |-----|-----------------------------|----------|----------|----------------------------------|-------|--------------------| - | 0 | angle of the first arm | -Inf | Inf | joint0 | hinge | angle (rad) | - | 1 | angle of the second arm | -Inf | Inf | joint1 | hinge | angle (rad) | - | 2 | x-coordinate of the target | -Inf | Inf | target_x | slide | position (m) | - | 3 | y-coordinate of the target | -Inf | Inf | target_y | slide | position (m) | - - - ### Rewards - The reward consists of two parts: - - *reward_distance*: This reward is a measure of how far the *fingertip* - of the reacher (the unattached end) is from the target, with a more negative - value assigned for when the reacher's *fingertip* is further away from the - target. It is calculated as the negative vector norm of (position of - the fingertip - position of target), or *-norm("fingertip" - "target")*. - - *reward_control*: A negative reward for penalising the walker if - it takes actions that are too large. It is measured as the negative squared - Euclidean norm of the action, i.e. as *- sum(action2)*. - - The total reward returned is ***reward*** *=* *reward_distance + reward_control* - - Unlike other environments, Reacher does not allow you to specify weights for the individual reward terms. - However, `info` does contain the keys *reward_dist* and *reward_ctrl*. Thus, if you'd like to weight the terms, - you should create a wrapper that computes the weighted reward from `info`. - - - ### Starting State - All observations start in state - (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) - with a noise added for stochasticity. A uniform noise in the range - [-0.1, 0.1] is added to the positional attributes, while the target position - is selected uniformly at random in a disk of radius 0.2 around the origin. - Independent, uniform noise in the - range of [-0.005, 0.005] is added to the velocities, and the last - element ("fingertip" - "target") is calculated at the end once everything - is set. The default setting has a framerate of 2 and a *dt = 2 * 0.01 = 0.02* - - ### Episode Termination - - The episode terminates when any of the following happens: - - 1. The episode duration reaches a 50 timesteps (with a new random target popping up if the reacher's fingertip reaches it before 50 timesteps) - 2. Any of the state space values is no longer finite. - - ### Arguments - - No additional arguments are currently supported (in v2 and lower), - but modifications can be made to the XML file in the assets folder - (or by changing the path to a modified XML file in another folder).. - - ``` - env = gym.make('Reacher-v2') - ``` - - There is no v3 for Reacher, unlike the robot environments where a v3 and - beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. - - - ### Version History - - * v2: All continuous control environments now use mujoco_py >= 1.50 - * v1: max_time_steps raised to 1000 for robot based tasks (not including reacher, which has a max_time_steps of 50). Added reward_threshold to environments. - * v0: Initial versions release (1.0.0) - - """ - def __init__(self): utils.EzPickle.__init__(self) - mujoco_env.MujocoEnv.__init__(self, "reacher.xml", 2) + mujoco_env.MujocoEnv.__init__( + self, "reacher.xml", 2, mujoco_bindings="mujoco_py" + ) def step(self, a): vec = self.get_body_com("fingertip") - self.get_body_com("target") diff --git a/gym/envs/mujoco/reacher_v4.py b/gym/envs/mujoco/reacher_v4.py new file mode 100644 index 00000000000..152ba86bbd4 --- /dev/null +++ b/gym/envs/mujoco/reacher_v4.py @@ -0,0 +1,165 @@ +import numpy as np + +from gym import utils +from gym.envs.mujoco import mujoco_env + + +class ReacherEnv(mujoco_env.MujocoEnv, utils.EzPickle): + """ + ### Description + "Reacher" is a two-jointed robot arm. The goal is to move the robot's end effector (called *fingertip*) close to a + target that is spawned at a random position. + + ### Action Space + The action space is a `Box(-1, 1, (2,), float32)`. An action `(a, b)` represents the torques applied at the hinge joints. + + | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit | + |-----|---------------------------------------------------------------------------------|-------------|-------------|--------------------------|-------|------| + | 0 | Torque applied at the first hinge (connecting the link to the point of fixture) | -1 | 1 | joint0 | hinge | torque (N m) | + | 1 | Torque applied at the second hinge (connecting the two links) | -1 | 1 | joint1 | hinge | torque (N m) | + + ### Observation Space + + Observations consist of + + - The cosine of the angles of the two arms + - The sine of the angles of the two arms + - The coordinates of the target + - The angular velocities of the arms + - The vector between the target and the reacher's fingertip (3 dimensional with the last element being 0) + + The observation is a `ndarray` with shape `(11,)` where the elements correspond to the following: + + | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint| Unit | + |-----|-----------------------|----------------------|--------------------|----------------------|--------------------|--------------------| + | 0 | cosine of the angle of the first arm | -Inf | Inf | cos(joint0) | hinge | unitless | + | 1 | cosine of the angle of the second arm | -Inf | Inf | cos(joint1) | hinge | unitless | + | 2 | sine of the angle of the first arm | -Inf | Inf | cos(joint0) | hinge | unitless | + | 3 | sine of the angle of the second arm | -Inf | Inf | cos(joint1) | hinge | unitless | + | 4 | x-coorddinate of the target | -Inf | Inf | target_x | slide | position (m) | + | 5 | y-coorddinate of the target | -Inf | Inf | target_y | slide | position (m) | + | 6 | angular velocity of the first arm | -Inf | Inf | joint0 | hinge | angular velocity (rad/s) | + | 7 | angular velocity of the second arm | -Inf | Inf | joint1 | hinge | angular velocity (rad/s) | + | 8 | x-value of position_fingertip - position_target | -Inf | Inf | NA | slide | position (m) | + | 9 | y-value of position_fingertip - position_target | -Inf | Inf | NA | slide | position (m) | + | 10 | z-value of position_fingertip - position_target (0 since reacher is 2d and z is same for both) | -Inf | Inf | NA | slide | position (m) | + + + Most Gym environments just return the positions and velocity of the + joints in the `.xml` file as the state of the environment. However, in + reacher the state is created by combining only certain elements of the + position and velocity, and performing some function transformations on them. + If one is to read the `.xml` for reacher then they will find 4 joints: + + | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit | + |-----|-----------------------------|----------|----------|----------------------------------|-------|--------------------| + | 0 | angle of the first arm | -Inf | Inf | joint0 | hinge | angle (rad) | + | 1 | angle of the second arm | -Inf | Inf | joint1 | hinge | angle (rad) | + | 2 | x-coordinate of the target | -Inf | Inf | target_x | slide | position (m) | + | 3 | y-coordinate of the target | -Inf | Inf | target_y | slide | position (m) | + + + ### Rewards + The reward consists of two parts: + - *reward_distance*: This reward is a measure of how far the *fingertip* + of the reacher (the unattached end) is from the target, with a more negative + value assigned for when the reacher's *fingertip* is further away from the + target. It is calculated as the negative vector norm of (position of + the fingertip - position of target), or *-norm("fingertip" - "target")*. + - *reward_control*: A negative reward for penalising the walker if + it takes actions that are too large. It is measured as the negative squared + Euclidean norm of the action, i.e. as *- sum(action2)*. + + The total reward returned is ***reward*** *=* *reward_distance + reward_control* + + Unlike other environments, Reacher does not allow you to specify weights for the individual reward terms. + However, `info` does contain the keys *reward_dist* and *reward_ctrl*. Thus, if you'd like to weight the terms, + you should create a wrapper that computes the weighted reward from `info`. + + + ### Starting State + All observations start in state + (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) + with a noise added for stochasticity. A uniform noise in the range + [-0.1, 0.1] is added to the positional attributes, while the target position + is selected uniformly at random in a disk of radius 0.2 around the origin. + Independent, uniform noise in the + range of [-0.005, 0.005] is added to the velocities, and the last + element ("fingertip" - "target") is calculated at the end once everything + is set. The default setting has a framerate of 2 and a *dt = 2 * 0.01 = 0.02* + + ### Episode Termination + + The episode terminates when any of the following happens: + + 1. The episode duration reaches a 50 timesteps (with a new random target popping up if the reacher's fingertip reaches it before 50 timesteps) + 2. Any of the state space values is no longer finite. + + ### Arguments + + No additional arguments are currently supported (in v2 and lower), + but modifications can be made to the XML file in the assets folder + (or by changing the path to a modified XML file in another folder).. + + ``` + env = gym.make('Reacher-v2') + ``` + + There is no v3 for Reacher, unlike the robot environments where a v3 and + beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. + + + ### Version History + + * v4: all mujoco environments now use the mujoco binidings in mujoco>=2.1.3 + * v2: All continuous control environments now use mujoco_py >= 1.50 + * v1: max_time_steps raised to 1000 for robot based tasks (not including reacher, which has a max_time_steps of 50). Added reward_threshold to environments. + * v0: Initial versions release (1.0.0) + + """ + + def __init__(self): + utils.EzPickle.__init__(self) + mujoco_env.MujocoEnv.__init__(self, "reacher.xml", 2) + + def step(self, a): + vec = self.get_body_com("fingertip") - self.get_body_com("target") + reward_dist = -np.linalg.norm(vec) + reward_ctrl = -np.square(a).sum() + reward = reward_dist + reward_ctrl + self.do_simulation(a, self.frame_skip) + ob = self._get_obs() + done = False + return ob, reward, done, dict(reward_dist=reward_dist, reward_ctrl=reward_ctrl) + + def viewer_setup(self): + self.viewer.cam.trackbodyid = 0 + + def reset_model(self): + qpos = ( + self.np_random.uniform(low=-0.1, high=0.1, size=self.model.nq) + + self.init_qpos + ) + while True: + self.goal = self.np_random.uniform(low=-0.2, high=0.2, size=2) + if np.linalg.norm(self.goal) < 0.2: + break + qpos[-2:] = self.goal + qvel = self.init_qvel + self.np_random.uniform( + low=-0.005, high=0.005, size=self.model.nv + ) + qvel[-2:] = 0 + self.set_state(qpos, qvel) + return self._get_obs() + + def _get_obs(self): + theta = self.data.qpos.flat[:2] + return np.concatenate( + [ + np.cos(theta), + np.sin(theta), + self.data.qpos.flat[2:], + self.data.qvel.flat[:2], + self.get_body_com("fingertip") - self.get_body_com("target"), + ] + ) diff --git a/gym/envs/mujoco/swimmer.py b/gym/envs/mujoco/swimmer.py index 429852f79a8..bfa6a8895fc 100644 --- a/gym/envs/mujoco/swimmer.py +++ b/gym/envs/mujoco/swimmer.py @@ -6,7 +6,9 @@ class SwimmerEnv(mujoco_env.MujocoEnv, utils.EzPickle): def __init__(self): - mujoco_env.MujocoEnv.__init__(self, "swimmer.xml", 4) + mujoco_env.MujocoEnv.__init__( + self, "swimmer.xml", 4, mujoco_bindings="mujoco_py" + ) utils.EzPickle.__init__(self) def step(self, a): diff --git a/gym/envs/mujoco/swimmer_v3.py b/gym/envs/mujoco/swimmer_v3.py index db07f238f28..18f848c3678 100644 --- a/gym/envs/mujoco/swimmer_v3.py +++ b/gym/envs/mujoco/swimmer_v3.py @@ -9,119 +9,6 @@ class SwimmerEnv(mujoco_env.MujocoEnv, utils.EzPickle): - """ - ### Description - - This environment corresponds to the Swimmer environment described in Rémi Coulom's PhD thesis - ["Reinforcement Learning Using Neural Networks, with Applications to Motor Control"](https://tel.archives-ouvertes.fr/tel-00003985/document). - The environment aims to increase the number of independent state and control - variables as compared to the classic control environments. The swimmers - consist of three or more segments ('***links***') and one less articulation - joints ('***rotors***') - one rotor joint connecting exactly two links to - form a linear chain. The swimmer is suspended in a two dimensional pool and - always starts in the same position (subject to some deviation drawn from an - uniform distribution), and the goal is to move as fast as possible towards - the right by applying torque on the rotors and using the fluids friction. - - ### Notes - - The problem parameters are: - Problem parameters: - * *n*: number of body parts - * *mi*: mass of part *i* (*i* ∈ {1...n}) - * *li*: length of part *i* (*i* ∈ {1...n}) - * *k*: viscous-friction coefficient - - While the default environment has *n* = 3, *li* = 0.1, - and *k* = 0.1. It is possible to pass a custom MuJoCo XML file during construction to increase the - number of links, or to tweak any of the parameters. - - ### Action Space - The action space is a `Box(-1, 1, (2,), float32)`. An action represents the torques applied between *links* - - | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit | - |-----|------------------------------------|-------------|-------------|----------------------------------|-------|--------------| - | 0 | Torque applied on the first rotor | -1 | 1 | rot2 | hinge | torque (N m) | - | 1 | Torque applied on the second rotor | -1 | 1 | rot3 | hinge | torque (N m) | - - ### Observation Space - - By default, observations consists of: - * θi: angle of part *i* with respect to the *x* axis - * θi': its derivative with respect to time (angular velocity) - - In the default case, observations do not include the x- and y-coordinates of the front tip. These may - be included by passing `exclude_current_positions_from_observation=False` during construction. - Then, the observation space will have 10 dimensions where the first two dimensions - represent the x- and y-coordinates of the front tip. - Regardless of whether `exclude_current_positions_from_observation` was set to true or false, the x- and y-coordinates - will be returned in `info` with keys `"x_position"` and `"y_position"`, respectively. - - By default, the observation is a `ndarray` with shape `(8,)` where the elements correspond to the following: - - | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint| Unit | - |-----|-----------------------|----------------------|--------------------|----------------------|--------------------|--------------------| - | 0 | angle of the front tip | -Inf | Inf | rot | hinge | angle (rad) | - | 1 | angle of the second rotor | -Inf | Inf | rot2 | hinge | angle (rad) | - | 2 | angle of the second rotor | -Inf | Inf | rot3 | hinge | angle (rad) | - | 3 | velocity of the tip along the x-axis | -Inf | Inf | slider1 | slide | velocity (m/s) | - | 4 | velocity of the tip along the y-axis | -Inf | Inf | slider2 | slide | velocity (m/s) | - | 5 | angular velocity of front tip | -Inf | Inf | rot | hinge | angular velocity (rad/s) | - | 6 | angular velocity of second rotor | -Inf | Inf | rot2 | hinge | angular velocity (rad/s) | - | 7 | angular velocity of third rotor | -Inf | Inf | rot3 | hinge | angular velocity (rad/s) | - - ### Rewards - The reward consists of two parts: - - *forward_reward*: A reward of moving forward which is measured - as *`forward_reward_weight` * (x-coordinate before action - x-coordinate after action)/dt*. *dt* is - the time between actions and is dependent on the frame_skip parameter - (default is 4), where the frametime is 0.01 - making the - default *dt = 4 * 0.01 = 0.04*. This reward would be positive if the swimmer - swims right as desired. - - *ctrl_cost*: A cost for penalising the swimmer if it takes - actions that are too large. It is measured as *`ctrl_cost_weight` * - sum(action2)* where *`ctrl_cost_weight`* is a parameter set for the - control and has a default value of 1e-4 - - The total reward returned is ***reward*** *=* *forward_reward - ctrl_cost* and `info` will also contain the individual reward terms - - ### Starting State - All observations start in state (0,0,0,0,0,0,0,0) with a Uniform noise in the range of [-`reset_noise_scale`, `reset_noise_scale`] is added to the initial state for stochasticity. - - ### Episode Termination - The episode terminates when the episode length is greater than 1000. - - ### Arguments - - No additional arguments are currently supported in v2 and lower. - - ``` - gym.make('Swimmer-v2') - ``` - - v3 and beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. - - ``` - env = gym.make('Swimmer-v3', ctrl_cost_weight=0.1, ....) - ``` - - | Parameter | Type | Default |Description | - |-------------------------|------------|--------------|-------------------------------| - | `xml_file` | **str** | `"swimmer.xml"`| Path to a MuJoCo model | - | `forward_reward_weight` | **float** | `1.0` | Weight for *forward_reward* term (see section on reward) | - | `ctrl_cost_weight` | **float** | `1e-4` | Weight for *ctrl_cost* term (see section on reward) | - | `reset_noise_scale` | **float** | `0.1` | Scale of random perturbations of initial position and velocity (see section on Starting State) | - | `exclude_current_positions_from_observation`| **bool** | `True`| Whether or not to omit the x- and y-coordinates from observations. Excluding the position can serve as an inductive bias to induce position-agnostic behavior in policies | - - - ### Version History - - * v3: support for gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. rgb rendering comes from tracking camera (so agent does not run away from screen) - * v2: All continuous control environments now use mujoco_py >= 1.50 - * v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments. - * v0: Initial versions release (1.0.0) - """ - def __init__( self, xml_file="swimmer.xml", @@ -141,7 +28,7 @@ def __init__( exclude_current_positions_from_observation ) - mujoco_env.MujocoEnv.__init__(self, xml_file, 4) + mujoco_env.MujocoEnv.__init__(self, xml_file, 4, mujoco_bindings="mujoco_py") def control_cost(self, action): control_cost = self._ctrl_cost_weight * np.sum(np.square(action)) diff --git a/gym/envs/mujoco/swimmer_v4.py b/gym/envs/mujoco/swimmer_v4.py new file mode 100644 index 00000000000..76c380eb428 --- /dev/null +++ b/gym/envs/mujoco/swimmer_v4.py @@ -0,0 +1,217 @@ +__credits__ = ["Rushiv Arora"] + +import numpy as np + +from gym import utils +from gym.envs.mujoco import mujoco_env + +DEFAULT_CAMERA_CONFIG = {} + + +class SwimmerEnv(mujoco_env.MujocoEnv, utils.EzPickle): + """ + ### Description + + This environment corresponds to the Swimmer environment described in Rémi Coulom's PhD thesis + ["Reinforcement Learning Using Neural Networks, with Applications to Motor Control"](https://tel.archives-ouvertes.fr/tel-00003985/document). + The environment aims to increase the number of independent state and control + variables as compared to the classic control environments. The swimmers + consist of three or more segments ('***links***') and one less articulation + joints ('***rotors***') - one rotor joint connecting exactly two links to + form a linear chain. The swimmer is suspended in a two dimensional pool and + always starts in the same position (subject to some deviation drawn from an + uniform distribution), and the goal is to move as fast as possible towards + the right by applying torque on the rotors and using the fluids friction. + + ### Notes + + The problem parameters are: + Problem parameters: + * *n*: number of body parts + * *mi*: mass of part *i* (*i* ∈ {1...n}) + * *li*: length of part *i* (*i* ∈ {1...n}) + * *k*: viscous-friction coefficient + + While the default environment has *n* = 3, *li* = 0.1, + and *k* = 0.1. It is possible to tweak the MuJoCo XML files to increase the + number of links, or to tweak any of the parameters. + + ### Action Space + The agent take a 2-element vector for actions. + The action space is a continuous `(action, action)` in `[-1, 1]`, where + `action` represents the numerical torques applied between *links* + + | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit | + |-----|------------------------------------|-------------|-------------|----------------------------------|-------|--------------| + | 0 | Torque applied on the first rotor | -1 | 1 | rot2 | hinge | torque (N m) | + | 1 | Torque applied on the second rotor | -1 | 1 | rot3 | hinge | torque (N m) | + + ### Observation Space + + The state space consists of: + * A0: position of first point + * θi: angle of part *i* with respect to the *x* axis + * A0, θi: their derivatives with respect to time (velocity and angular velocity) + + The observation is a `ndarray` with shape `(8,)` where the elements correspond to the following: + + | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint| Unit | + |-----|-----------------------|----------------------|--------------------|----------------------|--------------------|--------------------| + | 0 | x-coordinate of the front tip | -Inf | Inf | slider1 | slide | position (m) | + | 1 | y-coordinate of the front tip | -Inf | Inf | slider2 | slide | position (m) | + | 2 | angle of the front tip | -Inf | Inf | rot | hinge | angle (rad) | + | 3 | angle of the second rotor | -Inf | Inf | rot2 | hinge | angle (rad) | + | 4 | angle of the second rotor | -Inf | Inf | rot3 | hinge | angle (rad) | + | 5 | velocity of the tip along the x-axis | -Inf | Inf | slider1 | slide | velocity (m/s) | + | 6 | velocity of the tip along the y-axis | -Inf | Inf | slider2 | slide | velocity (m/s) | + | 7 | angular velocity of front tip | -Inf | Inf | rot | hinge | angular velocity (rad/s) | + | 8 | angular velocity of second rotor | -Inf | Inf | rot2 | hinge | angular velocity (rad/s) | + | 9 | angular velocity of third rotor | -Inf | Inf | rot3 | hinge | angular velocity (rad/s) | + + **Note:** + In practice (and Gym implementation), the first two positional elements are + omitted from the state space since the reward function is calculated based + on those values. Therefore, observation space has shape `(8,)` and looks like: + + | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint| Unit | + |-----|-----------------------|----------------------|--------------------|----------------------|--------------------|--------------------| + | 0 | angle of the front tip | -Inf | Inf | rot | hinge | angle (rad) | + | 1 | angle of the second rotor | -Inf | Inf | rot2 | hinge | angle (rad) | + | 2 | angle of the second rotor | -Inf | Inf | rot3 | hinge | angle (rad) | + | 3 | velocity of the tip along the x-axis | -Inf | Inf | slider1 | slide | velocity (m/s) | + | 4 | velocity of the tip along the y-axis | -Inf | Inf | slider2 | slide | velocity (m/s) | + | 5 | angular velocity of front tip | -Inf | Inf | rot | hinge | angular velocity (rad/s) | + | 6 | angular velocity of second rotor | -Inf | Inf | rot2 | hinge | angular velocity (rad/s) | + | 7 | angular velocity of third rotor | -Inf | Inf | rot3 | hinge | angular velocity (rad/s) | + + ### Rewards + The reward consists of two parts: + - *reward_front*: A reward of moving forward which is measured + as *(x-coordinate before action - x-coordinate after action)/dt*. *dt* is + the time between actions and is dependeent on the frame_skip parameter + (default is 4), where the *dt* for one frame is 0.01 - making the + default *dt = 4 * 0.01 = 0.04*. This reward would be positive if the swimmer + swims right as desired. + - *reward_control*: A negative reward for penalising the swimmer if it takes + actions that are too large. It is measured as *-coefficient x + sum(action2)* where *coefficient* is a parameter set for the + control and has a default value of 0.0001 + + The total reward returned is ***reward*** *=* *reward_front + reward_control* + + ### Starting State + All observations start in state (0,0,0,0,0,0,0,0) with a Uniform noise in the range of [-0.1, 0.1] is added to the initial state for stochasticity. + + ### Episode Termination + The episode terminates when the episode length is greater than 1000. + + ### Arguments + + No additional arguments are currently supported (in v2 and lower), but + modifications can be made to the XML file in the assets folder + (or by changing the path to a modified XML file in another folder). + + ``` + gym.make('Swimmer-v2') + ``` + + v3 and beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. + + ``` + env = gym.make('Swimmer-v3', ctrl_cost_weight=0.1, ....) + ``` + + ### Version History + + * v4: all mujoco environments now use the mujoco binidings in mujoco>=2.1.3 + * v3: support for gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. rgb rendering comes from tracking camera (so agent does not run away from screen) + * v2: All continuous control environments now use mujoco_py >= 1.50 + * v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments. + * v0: Initial versions release (1.0.0) + """ + + def __init__( + self, + xml_file="swimmer.xml", + forward_reward_weight=1.0, + ctrl_cost_weight=1e-4, + reset_noise_scale=0.1, + exclude_current_positions_from_observation=True, + ): + utils.EzPickle.__init__(**locals()) + + self._forward_reward_weight = forward_reward_weight + self._ctrl_cost_weight = ctrl_cost_weight + + self._reset_noise_scale = reset_noise_scale + + self._exclude_current_positions_from_observation = ( + exclude_current_positions_from_observation + ) + + mujoco_env.MujocoEnv.__init__(self, xml_file, 4) + + def control_cost(self, action): + control_cost = self._ctrl_cost_weight * np.sum(np.square(action)) + return control_cost + + def step(self, action): + xy_position_before = self.data.qpos[0:2].copy() + self.do_simulation(action, self.frame_skip) + xy_position_after = self.data.qpos[0:2].copy() + + xy_velocity = (xy_position_after - xy_position_before) / self.dt + x_velocity, y_velocity = xy_velocity + + forward_reward = self._forward_reward_weight * x_velocity + + ctrl_cost = self.control_cost(action) + + observation = self._get_obs() + reward = forward_reward - ctrl_cost + done = False + info = { + "reward_fwd": forward_reward, + "reward_ctrl": -ctrl_cost, + "x_position": xy_position_after[0], + "y_position": xy_position_after[1], + "distance_from_origin": np.linalg.norm(xy_position_after, ord=2), + "x_velocity": x_velocity, + "y_velocity": y_velocity, + "forward_reward": forward_reward, + } + + return observation, reward, done, info + + def _get_obs(self): + position = self.data.qpos.flat.copy() + velocity = self.data.qvel.flat.copy() + + if self._exclude_current_positions_from_observation: + position = position[2:] + + observation = np.concatenate([position, velocity]).ravel() + return observation + + def reset_model(self): + noise_low = -self._reset_noise_scale + noise_high = self._reset_noise_scale + + qpos = self.init_qpos + self.np_random.uniform( + low=noise_low, high=noise_high, size=self.model.nq + ) + qvel = self.init_qvel + self.np_random.uniform( + low=noise_low, high=noise_high, size=self.model.nv + ) + + self.set_state(qpos, qvel) + + observation = self._get_obs() + return observation + + def viewer_setup(self): + for key, value in DEFAULT_CAMERA_CONFIG.items(): + if isinstance(value, np.ndarray): + getattr(self.viewer.cam, key)[:] = value + else: + setattr(self.viewer.cam, key, value) diff --git a/gym/envs/mujoco/walker2d.py b/gym/envs/mujoco/walker2d.py index 915ff45f7f1..fc35d633bd0 100644 --- a/gym/envs/mujoco/walker2d.py +++ b/gym/envs/mujoco/walker2d.py @@ -6,7 +6,9 @@ class Walker2dEnv(mujoco_env.MujocoEnv, utils.EzPickle): def __init__(self): - mujoco_env.MujocoEnv.__init__(self, "walker2d.xml", 4) + mujoco_env.MujocoEnv.__init__( + self, "walker2d.xml", 4, mujoco_bindings="mujoco_py" + ) utils.EzPickle.__init__(self) def step(self, a): diff --git a/gym/envs/mujoco/walker2d_v3.py b/gym/envs/mujoco/walker2d_v3.py index 2c091810f28..a03ef972453 100644 --- a/gym/envs/mujoco/walker2d_v3.py +++ b/gym/envs/mujoco/walker2d_v3.py @@ -12,137 +12,6 @@ class Walker2dEnv(mujoco_env.MujocoEnv, utils.EzPickle): - """ - ### Description - - This environment builds on the hopper environment based on the work done by Erez, Tassa, and Todorov - in ["Infinite Horizon Model Predictive Control for Nonlinear Periodic Tasks"](http://www.roboticsproceedings.org/rss07/p10.pdf) - by adding another set of legs making it possible for the robot to walker forward instead of - hop. Like other Mujoco environments, this environment aims to increase the number of independent state - and control variables as compared to the classic control environments. The walker is a - two-dimensional two-legged figure that consist of four main body parts - a single torso at the top - (with the two legs splitting after the torso), two thighs in the middle below the torso, two legs - in the bottom below the thighs, and two feet attached to the legs on which the entire body rests. - The goal is to make coordinate both sets of feet, legs, and thighs to move in the forward (right) - direction by applying torques on the six hinges connecting the six body parts. - - ### Action Space - The action space is a `Box(-1, 1, (6,), float32)`. An action represents the torques applied at the hinge joints. - - | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit | - |-----|----------------------------------------|-------------|-------------|----------------------------------|-------|--------------| - | 0 | Torque applied on the thigh rotor | -1 | 1 | thigh_joint | hinge | torque (N m) | - | 1 | Torque applied on the leg rotor | -1 | 1 | leg_joint | hinge | torque (N m) | - | 2 | Torque applied on the foot rotor | -1 | 1 | foot_joint | hinge | torque (N m) | - | 3 | Torque applied on the left thigh rotor | -1 | 1 | thigh_left_joint | hinge | torque (N m) | - | 4 | Torque applied on the left leg rotor | -1 | 1 | leg_left_joint | hinge | torque (N m) | - | 5 | Torque applied on the left foot rotor | -1 | 1 | foot_left_joint | hinge | torque (N m) | - - ### Observation Space - - Observations consist of positional values of different body parts of the walker, - followed by the velocities of those individual parts (their derivatives) with all the positions ordered before all the velocities. - - By default, observations do not include the x-coordinate of the top. It may - be included by passing `exclude_current_positions_from_observation=False` during construction. - In that case, the observation space will have 18 dimensions where the first dimension - represent the x-coordinates of the top of the walker. - Regardless of whether `exclude_current_positions_from_observation` was set to true or false, the x-coordinate - of the top will be returned in `info` with key `"x_position"`. - - By default, observation is a `ndarray` with shape `(17,)` where the elements correspond to the following: - - | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit | - |-----|--------------------------------------------------------|----------------|-----------------|----------------------------------------|-------|------| - | 0 | z-coordinate of the top (height of hopper) | -Inf | Inf | rootz (torso) | slide | position (m) | - | 1 | angle of the top | -Inf | Inf | rooty (torso) | hinge | angle (rad) | - | 2 | angle of the thigh joint | -Inf | Inf | thigh_joint | hinge | angle (rad) | - | 3 | angle of the leg joint | -Inf | Inf | leg_joint | hinge | angle (rad) | - | 4 | angle of the foot joint | -Inf | Inf | foot_joint | hinge | angle (rad) | - | 5 | angle of the left thigh joint | -Inf | Inf | thigh_left_joint | hinge | angle (rad) | - | 6 | angle of the left leg joint | -Inf | Inf | leg_left_joint | hinge | angle (rad) | - | 7 | angle of the left foot joint | -Inf | Inf | foot_left_joint | hinge | angle (rad) | - | 8 | velocity of the x-coordinate of the top | -Inf | Inf | rootx | slide | velocity (m/s) | - | 9 | velocity of the z-coordinate (height) of the top | -Inf | Inf | rootz | slide | velocity (m/s) | - | 10 | angular velocity of the angle of the top | -Inf | Inf | rooty | hinge | angular velocity (rad/s) | - | 11 | angular velocity of the thigh hinge | -Inf | Inf | thigh_joint | hinge | angular velocity (rad/s) | - | 12 | angular velocity of the leg hinge | -Inf | Inf | leg_joint | hinge | angular velocity (rad/s) | - | 13 | angular velocity of the foot hinge | -Inf | Inf | foot_joint | hinge | angular velocity (rad/s) | - | 14 | angular velocity of the thigh hinge | -Inf | Inf | thigh_left_joint | hinge | angular velocity (rad/s) | - | 15 | angular velocity of the leg hinge | -Inf | Inf | leg_left_joint | hinge | angular velocity (rad/s) | - | 16 | angular velocity of the foot hinge | -Inf | Inf | foot_left_joint | hinge | angular velocity (rad/s) | - - ### Rewards - The reward consists of three parts: - - *healthy_reward*: Every timestep that the walker is alive, it receives a fixed reward of value `healthy_reward`, - - *forward_reward*: A reward of walking forward which is measured as - *`forward_reward_weight` * (x-coordinate before action - x-coordinate after action)/dt*. - *dt* is the time between actions and is dependeent on the frame_skip parameter - (default is 4), where the frametime is 0.002 - making the default - *dt = 4 * 0.002 = 0.008*. This reward would be positive if the walker walks forward (right) desired. - - *ctrl_cost*: A cost for penalising the walker if it - takes actions that are too large. It is measured as - *`ctrl_cost_weight` * sum(action2)* where *`ctrl_cost_weight`* is - a parameter set for the control and has a default value of 0.001 - - The total reward returned is ***reward*** *=* *healthy_reward bonus + forward_reward - ctrl_cost* and `info` will also contain the individual reward terms - - ### Starting State - All observations start in state - (0.0, 1.25, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) - with a uniform noise in the range of [-`reset_noise_scale`, `reset_noise_scale`] added to the values for stochasticity. - - ### Episode Termination - The walker is said to be unhealthy if any of the following happens: - - 1. Any of the state space values is no longer finite - 2. The height of the walker is ***not*** in the closed interval specified by `healthy_z_range` - 3. The absolute value of the angle (`observation[1]` if `exclude_current_positions_from_observation=False`, else `observation[2]`) is ***not*** in the closed interval specified by `healthy_angle_range` - - If `terminate_when_unhealthy=True` is passed during construction (which is the default), - the episode terminates when any of the following happens: - - 1. The episode duration reaches a 1000 timesteps - 2. The walker is unhealthy - - If `terminate_when_unhealthy=False` is passed, the episode is terminated only when 1000 timesteps are exceeded. - - ### Arguments - - No additional arguments are currently supported in v2 and lower. - - ``` - env = gym.make('Walker2d-v2') - ``` - - v3 and beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. - - ``` - env = gym.make('Walker2d-v3', ctrl_cost_weight=0.1, ....) - ``` - - | Parameter | Type | Default |Description | - |-------------------------|------------|--------------|-------------------------------| - | `xml_file` | **str** | `"walker2d.xml"` | Path to a MuJoCo model | - | `forward_reward_weight` | **float** | `1.0` | Weight for *forward_reward* term (see section on reward) | - | `ctrl_cost_weight` | **float** | `1e-3` | Weight for *ctr_cost* term (see section on reward) | - | `healthy_reward` | **float** | `1.0` | Constant reward given if the ant is "healthy" after timestep | - | `terminate_when_unhealthy` | **bool**| `True` | If true, issue a done signal if the z-coordinate of the walker is no longer healthy | - | `healthy_z_range` | **tuple** | `(0.8, 2)` | The z-coordinate of the top of the walker must be in this range to be considered healthy | - | `healthy_angle_range` | **tuple** | `(-1, 1)` | The angle must be in this range to be considered healthy| - | `reset_noise_scale` | **float** | `5e-3` | Scale of random perturbations of initial position and velocity (see section on Starting State) | - | `exclude_current_positions_from_observation`| **bool** | `True`| Whether or not to omit the x-coordinate from observations. Excluding the position can serve as an inductive bias to induce position-agnostic behavior in policies | - - - ### Version History - - * v3: support for gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. rgb rendering comes from tracking camera (so agent does not run away from screen) - * v2: All continuous control environments now use mujoco_py >= 1.50 - * v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments. - * v0: Initial versions release (1.0.0) - - """ - def __init__( self, xml_file="walker2d.xml", @@ -172,7 +41,7 @@ def __init__( exclude_current_positions_from_observation ) - mujoco_env.MujocoEnv.__init__(self, xml_file, 4) + mujoco_env.MujocoEnv.__init__(self, xml_file, 4, mujoco_bindings="mujoco_py") @property def healthy_reward(self): diff --git a/gym/envs/mujoco/walker2d_v4.py b/gym/envs/mujoco/walker2d_v4.py new file mode 100644 index 00000000000..df5706bef05 --- /dev/null +++ b/gym/envs/mujoco/walker2d_v4.py @@ -0,0 +1,276 @@ +import numpy as np + +from gym import utils +from gym.envs.mujoco import mujoco_env + +DEFAULT_CAMERA_CONFIG = { + "trackbodyid": 2, + "distance": 4.0, + "lookat": np.array((0.0, 0.0, 1.15)), + "elevation": -20.0, +} + +DEFAULT_CAMERA_CONFIG = { + "trackbodyid": 2, + "distance": 4.0, + "lookat": np.array((0.0, 0.0, 1.15)), + "elevation": -20.0, +} + + +class Walker2dEnv(mujoco_env.MujocoEnv, utils.EzPickle): + """ + ### Description + + This environment builds on the hopper environment based on the work done by Erez, Tassa, and Todorov + in ["Infinite Horizon Model Predictive Control for Nonlinear Periodic Tasks"](http://www.roboticsproceedings.org/rss07/p10.pdf) + by adding another set of legs making it possiible for the robot to walker forward instead of + hop. Like other Mujoco environments, this environment aims to increase the number of independent state + and control variables as compared to the classic control environments. The walker is a + two-dimensional two-legged figure that consist of four main body parts - a single torso at the top + (with the two legs splitting after the torso), two thighs in the middle below the torso, two legs + in the bottom below the thighs, and two feet attached to the legs on which the entire body rests. + The goal is to make coordinate both sets of feet, legs, and thighs to move in the forward (right) + direction by applying torques on the six hinges connecting the six body parts. + + ### Action Space + The agent take a 6-element vector for actions. + The action space is a continuous `(action, action, action, action, action, action)` + all in `[-1, 1]`, where `action` represents the numerical torques applied at the hinge joints. + + | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit | + |-----|----------------------------------------|-------------|-------------|----------------------------------|-------|--------------| + | 0 | Torque applied on the thigh rotor | -1 | 1 | thigh_joint | hinge | torque (N m) | + | 1 | Torque applied on the leg rotor | -1 | 1 | leg_joint | hinge | torque (N m) | + | 2 | Torque applied on the foot rotor | -1 | 1 | foot_joint | hinge | torque (N m) | + | 3 | Torque applied on the left thigh rotor | -1 | 1 | thigh_left_joint | hinge | torque (N m) | + | 4 | Torque applied on the left leg rotor | -1 | 1 | leg_left_joint | hinge | torque (N m) | + | 5 | Torque applied on the left foot rotor | -1 | 1 | foot_left_joint | hinge | torque (N m) | + + ### Observation Space + + The state space consists of positional values of different body parts of the walker, + followed by the velocities of those individual parts (their derivatives) with all the positions ordered before all the velocities. + + The observation is a `ndarray` with shape `(17,)` where the elements correspond to the following: + + | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit | + |-----|--------------------------------------------------------|----------------|-----------------|----------------------------------------|-------|------| + | 0 | x-coordinate of the top | -Inf | Inf | rootx (torso) | slide | position (m) | + | 1 | z-coordinate of the top (height of hopper) | -Inf | Inf | rootz (torso) | slide | position (m) | + | 2 | angle of the top | -Inf | Inf | rooty (torso) | hinge | angle (rad) | + | 3 | angle of the thigh joint | -Inf | Inf | thigh_joint | hinge | angle (rad) | + | 4 | angle of the leg joint | -Inf | Inf | leg_joint | hinge | angle (rad) | + | 5 | angle of the foot joint | -Inf | Inf | foot_joint | hinge | angle (rad) | + | 6 | angle of the left thigh joint | -Inf | Inf | thigh_left_joint | hinge | angle (rad) | + | 7 | angle of the left leg joint | -Inf | Inf | leg_left_joint | hinge | angle (rad) | + | 8 | angle of the left foot joint | -Inf | Inf | foot_left_joint | hinge | angle (rad) | + | 9 | velocity of the x-coordinate of the top | -Inf | Inf | rootx | slide | velocity (m/s) | + | 10 | velocity of the z-coordinate (height) of the top | -Inf | Inf | rootz | slide | velocity (m/s) | + | 11 | angular velocity of the angle of the top | -Inf | Inf | rooty | hinge | angular velocity (rad/s) | + | 12 | angular velocity of the thigh hinge | -Inf | Inf | thigh_joint | hinge | angular velocity (rad/s) | + | 13 | angular velocity of the leg hinge | -Inf | Inf | leg_joint | hinge | angular velocity (rad/s) | + | 14 | angular velocity of the foot hinge | -Inf | Inf | foot_joint | hinge | angular velocity (rad/s) | + | 15 | angular velocity of the thigh hinge | -Inf | Inf | thigh_left_joint | hinge | angular velocity (rad/s) | + | 16 | angular velocity of the leg hinge | -Inf | Inf | leg_left_joint | hinge | angular velocity (rad/s) | + | 17 | angular velocity of the foot hinge | -Inf | Inf | foot_left_joint | hinge | angular velocity (rad/s) | + + + + **Note:** + In practice (and Gym implementation), the first positional element is omitted from the + state space since the reward function is calculated based on that value. This value is + hidden from the algorithm, which in turn has to develop an abstract understanding of it + from the observed rewards. Therefore, observation space has shape `(17,)` + instead of `(18,)` and looks like: + + | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit | + |-----|--------------------------------------------------------|----------------|-----------------|----------------------------------------|-------|------| + | 0 | z-coordinate of the top (height of hopper) | -Inf | Inf | rootz (torso) | slide | position (m) | + | 1 | angle of the top | -Inf | Inf | rooty (torso) | hinge | angle (rad) | + | 2 | angle of the thigh joint | -Inf | Inf | thigh_joint | hinge | angle (rad) | + | 3 | angle of the leg joint | -Inf | Inf | leg_joint | hinge | angle (rad) | + | 4 | angle of the foot joint | -Inf | Inf | foot_joint | hinge | angle (rad) | + | 5 | angle of the left thigh joint | -Inf | Inf | thigh_left_joint | hinge | angle (rad) | + | 6 | angle of the left leg joint | -Inf | Inf | leg_left_joint | hinge | angle (rad) | + | 7 | angle of the left foot joint | -Inf | Inf | foot_left_joint | hinge | angle (rad) | + | 8 | velocity of the x-coordinate of the top | -Inf | Inf | rootx | slide | velocity (m/s) | + | 9 | velocity of the z-coordinate (height) of the top | -Inf | Inf | rootz | slide | velocity (m/s) | + | 10 | angular velocity of the angle of the top | -Inf | Inf | rooty | hinge | angular velocity (rad/s) | + | 11 | angular velocity of the thigh hinge | -Inf | Inf | thigh_joint | hinge | angular velocity (rad/s) | + | 12 | angular velocity of the leg hinge | -Inf | Inf | leg_joint | hinge | angular velocity (rad/s) | + | 13 | angular velocity of the foot hinge | -Inf | Inf | foot_joint | hinge | angular velocity (rad/s) | + | 14 | angular velocity of the thigh hinge | -Inf | Inf | thigh_left_joint | hinge | angular velocity (rad/s) | + | 15 | angular velocity of the leg hinge | -Inf | Inf | leg_left_joint | hinge | angular velocity (rad/s) | + | 16 | angular velocity of the foot hinge | -Inf | Inf | foot_left_joint | hinge | angular velocity (rad/s) | + + ### Rewards + The reward consists of three parts: + - *alive bonus*: Every timestep that the walker is alive, it gets a reward of 1, + - *reward_forward*: A reward of walking forward which is measured as + *(x-coordinate before action - x-coordinate after action)/dt*. + *dt* is the time between actions and is dependeent on the frame_skip parameter + (default is 4), where the *dt* for one frame is 0.002 - making the default + *dt = 4 * 0.002 = 0.008*. This reward would be positive if the walker walks forward (right) desired. + - *reward_control*: A negative reward for penalising the walker if it + takes actions that are too large. It is measured as + *-coefficient **x** sum(action2)* where *coefficient* is + a parameter set for the control and has a default value of 0.001 + + The total reward returned is ***reward*** *=* *alive bonus + reward_forward + reward_control* + + ### Starting State + All observations start in state + (0.0, 1.25, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) + with a uniform noise in the range of [-0.005, 0.005] added to the values for stochasticity. + + ### Episode Termination + The episode terminates when any of the following happens: + + 1. The episode duration reaches a 1000 timesteps + 2. Any of the state space values is no longer finite + 3. The height of the walker (index 1) is ***not*** in the range `[0.8, 2]` + 4. The absolute value of the angle (index 2) is ***not*** in the range `[-1, 1]` + + ### Arguments + + No additional arguments are currently supported (in v2 and lower), + but modifications can be made to the XML file in the assets folder + (or by changing the path to a modified XML file in another folder).. + + ``` + env = gym.make('Walker2d-v2') + ``` + + v3 and beyond take gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. + + ``` + env = gym.make('Walker2d-v3', ctrl_cost_weight=0.1, ....) + ``` + + ### Version History + + * v4: all mujoco environments now use the mujoco binidings in mujoco>=2.1.3 + * v3: support for gym.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. rgb rendering comes from tracking camera (so agent does not run away from screen) + * v2: All continuous control environments now use mujoco_py >= 1.50 + * v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments. + * v0: Initial versions release (1.0.0) + + """ + + def __init__( + self, + xml_file="walker2d.xml", + forward_reward_weight=1.0, + ctrl_cost_weight=1e-3, + healthy_reward=1.0, + terminate_when_unhealthy=True, + healthy_z_range=(0.8, 2.0), + healthy_angle_range=(-1.0, 1.0), + reset_noise_scale=5e-3, + exclude_current_positions_from_observation=True, + ): + utils.EzPickle.__init__(**locals()) + + self._forward_reward_weight = forward_reward_weight + self._ctrl_cost_weight = ctrl_cost_weight + + self._healthy_reward = healthy_reward + self._terminate_when_unhealthy = terminate_when_unhealthy + + self._healthy_z_range = healthy_z_range + self._healthy_angle_range = healthy_angle_range + + self._reset_noise_scale = reset_noise_scale + + self._exclude_current_positions_from_observation = ( + exclude_current_positions_from_observation + ) + + mujoco_env.MujocoEnv.__init__(self, xml_file, 4) + + @property + def healthy_reward(self): + return ( + float(self.is_healthy or self._terminate_when_unhealthy) + * self._healthy_reward + ) + + def control_cost(self, action): + control_cost = self._ctrl_cost_weight * np.sum(np.square(action)) + return control_cost + + @property + def is_healthy(self): + z, angle = self.data.qpos[1:3] + + min_z, max_z = self._healthy_z_range + min_angle, max_angle = self._healthy_angle_range + + healthy_z = min_z < z < max_z + healthy_angle = min_angle < angle < max_angle + is_healthy = healthy_z and healthy_angle + + return is_healthy + + @property + def done(self): + done = not self.is_healthy if self._terminate_when_unhealthy else False + return done + + def _get_obs(self): + position = self.data.qpos.flat.copy() + velocity = np.clip(self.data.qvel.flat.copy(), -10, 10) + + if self._exclude_current_positions_from_observation: + position = position[1:] + + observation = np.concatenate((position, velocity)).ravel() + return observation + + def step(self, action): + x_position_before = self.data.qpos[0] + self.do_simulation(action, self.frame_skip) + x_position_after = self.data.qpos[0] + x_velocity = (x_position_after - x_position_before) / self.dt + + ctrl_cost = self.control_cost(action) + + forward_reward = self._forward_reward_weight * x_velocity + healthy_reward = self.healthy_reward + + rewards = forward_reward + healthy_reward + costs = ctrl_cost + + observation = self._get_obs() + reward = rewards - costs + done = self.done + info = { + "x_position": x_position_after, + "x_velocity": x_velocity, + } + + return observation, reward, done, info + + def reset_model(self): + noise_low = -self._reset_noise_scale + noise_high = self._reset_noise_scale + + qpos = self.init_qpos + self.np_random.uniform( + low=noise_low, high=noise_high, size=self.model.nq + ) + qvel = self.init_qvel + self.np_random.uniform( + low=noise_low, high=noise_high, size=self.model.nv + ) + + self.set_state(qpos, qvel) + + observation = self._get_obs() + return observation + + def viewer_setup(self): + for key, value in DEFAULT_CAMERA_CONFIG.items(): + if isinstance(value, np.ndarray): + getattr(self.viewer.cam, key)[:] = value + else: + setattr(self.viewer.cam, key, value) diff --git a/py.Dockerfile b/py.Dockerfile index f6221a9f11f..ce20b8430b9 100644 --- a/py.Dockerfile +++ b/py.Dockerfile @@ -6,15 +6,14 @@ RUN apt-get -y update && apt-get install -y unzip libglu1-mesa-dev libgl1-mesa-d # Download mujoco RUN mkdir /root/.mujoco && \ cd /root/.mujoco && \ - curl -O https://www.roboti.us/download/mjpro150_linux.zip && \ - unzip mjpro150_linux.zip && \ - echo DUMMY_KEY > /root/.mujoco/mjkey.txt + wget https://github.com/deepmind/mujoco/releases/download/2.1.0/mujoco210-linux-x86_64.tar.gz &&\ + tar -xf mujoco210-linux-x86_64.tar.gz -ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/root/.mujoco/mjpro150/bin +ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/root/.mujoco/mujoco210/bin COPY . /usr/local/gym/ WORKDIR /usr/local/gym/ -RUN pip install .[nomujoco] && pip install -r test_requirements.txt +RUN pip install .[noatari] && pip install -r test_requirements.txt ENTRYPOINT ["/usr/local/gym/bin/docker_entrypoint"] diff --git a/requirements.txt b/requirements.txt index 5055547c90a..0e5e5f91c15 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,6 @@ ale-py~=0.7 opencv-python>=3.0 box2d-py==2.3.5 -mujoco_py>=1.50, <2.0 scipy>=1.4.1 numpy>=1.18.0 pyglet>=1.4.0 diff --git a/setup.py b/setup.py index 7cd9d632139..ff4b817be6a 100644 --- a/setup.py +++ b/setup.py @@ -15,19 +15,25 @@ "accept-rom-license": ["autorom[accept-rom-license]~=0.4.2"], "box2d": ["box2d-py==2.3.5", "pygame==2.1.0"], "classic_control": ["pygame==2.1.0"], - "mujoco": ["mujoco_py>=1.50, <2.0"], + "mujoco_py": ["mujoco_py<2.2,>=2.1"], + "mujoco": ["mujoco==2.1.5", "imageio>=2.14.1"], "toy_text": ["pygame==2.1.0", "scipy>=1.4.1"], "other": ["lz4>=3.1.0", "opencv-python>=3.0", "matplotlib>=3.0"], } # Meta dependency groups. -nomujoco_blacklist = {"mujoco", "accept-rom-license", "atari"} +nomujoco_blacklist = {"mujoco_py", "mujoco", "accept-rom-license", "atari"} nomujoco_groups = set(extras.keys()) - nomujoco_blacklist extras["nomujoco"] = list( itertools.chain.from_iterable(map(lambda group: extras[group], nomujoco_groups)) ) +noatari_blacklist = {"accept-rom-license", "atari"} +noatari_groups = set(extras.keys()) - noatari_blacklist +extras["noatari"] = list( + itertools.chain.from_iterable(map(lambda group: extras[group], noatari_groups)) +) all_blacklist = {"accept-rom-license"} all_groups = set(extras.keys()) - all_blacklist diff --git a/tests/envs/spec_list.py b/tests/envs/spec_list.py index a0f192b7c2a..f7a1a2777be 100644 --- a/tests/envs/spec_list.py +++ b/tests/envs/spec_list.py @@ -1,19 +1,14 @@ -import os - from gym import envs, logger -SKIP_MUJOCO_WARNING_MESSAGE = ( - "Cannot run mujoco test (either license key not found or mujoco not" - "installed properly)." +SKIP_MUJOCO_V3_WARNING_MESSAGE = ( + "Cannot run mujoco test because mujoco-py is not installed" ) - -skip_mujoco = not (os.environ.get("MUJOCO_KEY")) -if not skip_mujoco: - try: - import mujoco_py # noqa:F401 - except ImportError: - skip_mujoco = True +skip_mujoco_v3 = False +try: + import mujoco_py # noqa:F401 +except ImportError: + skip_mujoco_v3 = True def should_skip_env_spec_for_tests(spec): @@ -21,7 +16,7 @@ def should_skip_env_spec_for_tests(spec): # troublesome to run frequently ep = spec.entry_point # Skip mujoco tests for pull request CI - if skip_mujoco and ep.startswith("gym.envs.mujoco"): + if skip_mujoco_v3 and ep.startswith("gym.envs.mujoco"): return True try: import gym.envs.atari # noqa:F401 @@ -48,8 +43,19 @@ def should_skip_env_spec_for_tests(spec): return False +def skip_mujoco_py_env_for_test(spec): + ep = spec.entry_point + version = spec.version + if ep.startswith("gym.envs.mujoco") and version < 4: + return True + return False + + spec_list = [ spec for spec in sorted(envs.registry.values(), key=lambda x: x.id) if spec.entry_point is not None and not should_skip_env_spec_for_tests(spec) ] +spec_list_no_mujoco_py = [ + spec for spec in spec_list if not skip_mujoco_py_env_for_test(spec) +] diff --git a/tests/envs/test_action_dim_check.py b/tests/envs/test_action_dim_check.py index dd1e3d51d1a..8289673c47e 100644 --- a/tests/envs/test_action_dim_check.py +++ b/tests/envs/test_action_dim_check.py @@ -8,17 +8,19 @@ from gym.spaces.box import Box from gym.spaces.discrete import Discrete from gym.spaces.space import Space -from tests.envs.spec_list import SKIP_MUJOCO_WARNING_MESSAGE, skip_mujoco, spec_list +from tests.envs.spec_list import ( + SKIP_MUJOCO_V3_WARNING_MESSAGE, + skip_mujoco_v3, + spec_list, +) ENVIRONMENT_IDS = ("HalfCheetah-v2",) def make_envs_by_action_space_type(spec_list: List[EnvSpec], action_space: Space): """Make environments of specific action_space type. - This function returns a filtered list of environment from the spec_list that matches the action_space type. - Args: spec_list (list): list of registered environments' specification action_space (gym.spaces.Space): action_space type @@ -31,7 +33,7 @@ def make_envs_by_action_space_type(spec_list: List[EnvSpec], action_space: Space return filtered_envs -@pytest.mark.skipif(skip_mujoco, reason=SKIP_MUJOCO_WARNING_MESSAGE) +@pytest.mark.skipif(skip_mujoco_v3, reason=SKIP_MUJOCO_V3_WARNING_MESSAGE) @pytest.mark.parametrize("environment_id", ENVIRONMENT_IDS) def test_serialize_deserialize(environment_id): env = envs.make(environment_id) @@ -47,10 +49,8 @@ def test_serialize_deserialize(environment_id): @pytest.mark.parametrize("env", make_envs_by_action_space_type(spec_list, Discrete)) def test_discrete_actions_out_of_bound(env): """Test out of bound actions in Discrete action_space. - In discrete action_space environments, `out-of-bound` actions are not allowed and should raise an exception. - Args: env (gym.Env): the gym environment """ @@ -69,11 +69,9 @@ def test_discrete_actions_out_of_bound(env): ) def test_box_actions_out_of_bound(env, seed): """Test out of bound actions in Box action_space. - Environments with Box actions spaces perform clipping inside `step`. The expected behaviour is that an action `out-of-bound` has the same effect of an action with value exactly at the upper (or lower) bound. - Args: env (gym.Env): the gym environment seed (int): seed value for determinism diff --git a/tests/envs/test_envs.py b/tests/envs/test_envs.py index c76cfa7b19c..8be72caa9dd 100644 --- a/tests/envs/test_envs.py +++ b/tests/envs/test_envs.py @@ -4,7 +4,7 @@ from gym import envs from gym.spaces import Box from gym.utils.env_checker import check_env -from tests.envs.spec_list import spec_list +from tests.envs.spec_list import spec_list, spec_list_no_mujoco_py # This runs a smoketest on each official registered env. We may want @@ -13,7 +13,9 @@ @pytest.mark.filterwarnings( "ignore:.*We recommend you to use a symmetric and normalized Box action space.*" ) -@pytest.mark.parametrize("spec", spec_list, ids=[spec.id for spec in spec_list]) +@pytest.mark.parametrize( + "spec", spec_list_no_mujoco_py, ids=[spec.id for spec in spec_list_no_mujoco_py] +) def test_env(spec): # Capture warnings with pytest.warns(None) as warnings: @@ -47,13 +49,15 @@ def test_env(spec): assert ( observation.dtype == ob_space.dtype ), f"Step observation dtype: {ob.dtype}, expected: {ob_space.dtype}" - for mode in env.metadata.get("render_modes", []): - env.render(mode=mode) + if not (mode == "human" and spec.entry_point.startswith("gym.envs.mujoco")): + env.render(mode=mode) # Make sure we can render the environment after close. for mode in env.metadata.get("render_modes", []): - env.render(mode=mode) + if not (mode == "human" and spec.entry_point.startswith("gym.envs.mujoco")): + + env.render(mode=mode) env.close() diff --git a/tests/envs/test_mujoco_v2_to_v3_conversion.py b/tests/envs/test_mujoco_v2_to_v3_conversion.py index 201d497667e..db540024388 100644 --- a/tests/envs/test_mujoco_v2_to_v3_conversion.py +++ b/tests/envs/test_mujoco_v2_to_v3_conversion.py @@ -3,7 +3,7 @@ import numpy as np from gym import envs -from tests.envs.spec_list import SKIP_MUJOCO_WARNING_MESSAGE, skip_mujoco +from tests.envs.spec_list import SKIP_MUJOCO_V3_WARNING_MESSAGE, skip_mujoco_v3 def verify_environments_match( @@ -31,7 +31,7 @@ def verify_environments_match( np.testing.assert_allclose(old_info[key], new_info[key], atol=eps) -@unittest.skipIf(skip_mujoco, SKIP_MUJOCO_WARNING_MESSAGE) +@unittest.skipIf(skip_mujoco_v3, SKIP_MUJOCO_V3_WARNING_MESSAGE) class Mujocov2Tov3ConversionTest(unittest.TestCase): def test_environments_match(self): test_cases = (