diff --git a/gym/envs/__init__.py b/gym/envs/__init__.py index 304cd0f8379..6dc0aa5bcc7 100644 --- a/gym/envs/__init__.py +++ b/gym/envs/__init__.py @@ -252,6 +252,13 @@ reward_threshold=4800.0, ) +register( + id='HalfCheetah-v3', + entry_point='gym.envs.mujoco.half_cheetah_v3:HalfCheetahEnv', + max_episode_steps=1000, + reward_threshold=4800.0, +) + register( id='Hopper-v2', entry_point='gym.envs.mujoco:HopperEnv', @@ -259,6 +266,13 @@ reward_threshold=3800.0, ) +register( + id='Hopper-v3', + entry_point='gym.envs.mujoco.hopper_v3:HopperEnv', + max_episode_steps=1000, + reward_threshold=3800.0, +) + register( id='Swimmer-v2', entry_point='gym.envs.mujoco:SwimmerEnv', @@ -266,12 +280,25 @@ reward_threshold=360.0, ) +register( + id='Swimmer-v3', + entry_point='gym.envs.mujoco.swimmer_v3:SwimmerEnv', + max_episode_steps=1000, + reward_threshold=360.0, +) + register( id='Walker2d-v2', max_episode_steps=1000, entry_point='gym.envs.mujoco:Walker2dEnv', ) +register( + id='Walker2d-v3', + max_episode_steps=1000, + entry_point='gym.envs.mujoco.walker2d_v3:Walker2dEnv', +) + register( id='Ant-v2', entry_point='gym.envs.mujoco:AntEnv', @@ -279,12 +306,25 @@ reward_threshold=6000.0, ) +register( + id='Ant-v3', + entry_point='gym.envs.mujoco.ant_v3:AntEnv', + max_episode_steps=1000, + reward_threshold=6000.0, +) + register( id='Humanoid-v2', entry_point='gym.envs.mujoco:HumanoidEnv', max_episode_steps=1000, ) +register( + id='Humanoid-v3', + entry_point='gym.envs.mujoco.humanoid_v3:HumanoidEnv', + max_episode_steps=1000, +) + register( id='HumanoidStandup-v2', entry_point='gym.envs.mujoco:HumanoidStandupEnv', @@ -551,4 +591,3 @@ def _merge(a, b): entry_point='gym.envs.unittest:MemorizeDigits', reward_threshold=20, ) - diff --git a/gym/envs/mujoco/ant_v3.py b/gym/envs/mujoco/ant_v3.py new file mode 100644 index 00000000000..2df5b9ad98a --- /dev/null +++ b/gym/envs/mujoco/ant_v3.py @@ -0,0 +1,146 @@ +import numpy as np +from gym import utils +from gym.envs.mujoco import mujoco_env + + +DEFAULT_CAMERA_CONFIG = { + 'distance': 4.0, +} + + +class AntEnv(mujoco_env.MujocoEnv, utils.EzPickle): + def __init__(self, + xml_file='ant.xml', + ctrl_cost_weight=0.5, + contact_cost_weight=5e-4, + healthy_reward=1.0, + terminate_when_unhealthy=True, + healthy_z_range=(0.2, 1.0), + contact_force_range=(-1.0, 1.0), + reset_noise_scale=0.1, + exclude_current_positions_from_observation=True): + utils.EzPickle.__init__(**locals()) + + self._ctrl_cost_weight = ctrl_cost_weight + self._contact_cost_weight = contact_cost_weight + + self._healthy_reward = healthy_reward + self._terminate_when_unhealthy = terminate_when_unhealthy + self._healthy_z_range = healthy_z_range + + self._contact_force_range = contact_force_range + + self._reset_noise_scale = reset_noise_scale + + self._exclude_current_positions_from_observation = ( + exclude_current_positions_from_observation) + + mujoco_env.MujocoEnv.__init__(self, xml_file, 5) + + @property + def healthy_reward(self): + return float( + self.is_healthy + or self._terminate_when_unhealthy + ) * self._healthy_reward + + def control_cost(self, action): + control_cost = self._ctrl_cost_weight * np.sum(np.square(action)) + return control_cost + + @property + def contact_forces(self): + raw_contact_forces = self.sim.data.cfrc_ext + min_value, max_value = self._contact_force_range + contact_forces = np.clip(raw_contact_forces, min_value, max_value) + return contact_forces + + @property + def contact_cost(self): + contact_cost = self._contact_cost_weight * np.sum( + 
np.square(self.contact_forces)) + return contact_cost + + @property + def is_healthy(self): + state = self.state_vector() + min_z, max_z = self._healthy_z_range + is_healthy = (np.isfinite(state).all() and min_z <= state[2] <= max_z) + return is_healthy + + @property + def done(self): + done = (not self.is_healthy + if self._terminate_when_unhealthy + else False) + return done + + def step(self, action): + xy_position_before = self.get_body_com("torso")[:2].copy() + self.do_simulation(action, self.frame_skip) + xy_position_after = self.get_body_com("torso")[:2].copy() + + xy_velocity = (xy_position_after - xy_position_before) / self.dt + x_velocity, y_velocity = xy_velocity + + ctrl_cost = self.control_cost(action) + contact_cost = self.contact_cost + + forward_reward = x_velocity + healthy_reward = self.healthy_reward + + rewards = forward_reward + healthy_reward + costs = ctrl_cost + contact_cost + + reward = rewards - costs + done = self.done + observation = self._get_obs() + info = { + 'reward_forward': forward_reward, + 'reward_ctrl': -ctrl_cost, + 'reward_contact': -contact_cost, + 'reward_survive': healthy_reward, + + 'x_position': xy_position_after[0], + 'y_position': xy_position_after[1], + 'distance_from_origin': np.linalg.norm(xy_position_after, ord=2), + + 'x_velocity': x_velocity, + 'y_velocity': y_velocity, + 'forward_reward': forward_reward, + } + + return observation, reward, done, info + + def _get_obs(self): + position = self.sim.data.qpos.flat.copy() + velocity = self.sim.data.qvel.flat.copy() + contact_force = self.contact_forces.flat.copy() + + if self._exclude_current_positions_from_observation: + position = position[2:] + + observations = np.concatenate((position, velocity, contact_force)) + + return observations + + def reset_model(self): + noise_low = -self._reset_noise_scale + noise_high = self._reset_noise_scale + + qpos = self.init_qpos + self.np_random.uniform( + low=noise_low, high=noise_high, size=self.model.nq) + qvel = self.init_qvel + self._reset_noise_scale * self.np_random.randn( + self.model.nv) + self.set_state(qpos, qvel) + + observation = self._get_obs() + + return observation + + def viewer_setup(self): + for key, value in DEFAULT_CAMERA_CONFIG.items(): + if isinstance(value, np.ndarray): + getattr(self.viewer.cam, key)[:] = value + else: + setattr(self.viewer.cam, key, value) diff --git a/gym/envs/mujoco/half_cheetah_v3.py b/gym/envs/mujoco/half_cheetah_v3.py new file mode 100644 index 00000000000..461b47377e4 --- /dev/null +++ b/gym/envs/mujoco/half_cheetah_v3.py @@ -0,0 +1,88 @@ +import numpy as np +from gym import utils +from gym.envs.mujoco import mujoco_env + + +DEFAULT_CAMERA_CONFIG = { + 'distance': 4.0, +} + + +class HalfCheetahEnv(mujoco_env.MujocoEnv, utils.EzPickle): + def __init__(self, + xml_file='half_cheetah.xml', + forward_reward_weight=1.0, + ctrl_cost_weight=0.1, + reset_noise_scale=0.1, + exclude_current_positions_from_observation=True): + utils.EzPickle.__init__(**locals()) + + self._forward_reward_weight = forward_reward_weight + + self._ctrl_cost_weight = ctrl_cost_weight + + self._reset_noise_scale = reset_noise_scale + + self._exclude_current_positions_from_observation = ( + exclude_current_positions_from_observation) + + mujoco_env.MujocoEnv.__init__(self, xml_file, 5) + + def control_cost(self, action): + control_cost = self._ctrl_cost_weight * np.sum(np.square(action)) + return control_cost + + def step(self, action): + x_position_before = self.sim.data.qpos[0] + self.do_simulation(action, self.frame_skip) + 
x_position_after = self.sim.data.qpos[0] + x_velocity = ((x_position_after - x_position_before) + / self.dt) + + ctrl_cost = self.control_cost(action) + + forward_reward = self._forward_reward_weight * x_velocity + + observation = self._get_obs() + reward = forward_reward - ctrl_cost + done = False + info = { + 'x_position': x_position_after, + 'x_velocity': x_velocity, + + 'reward_run': forward_reward, + 'reward_ctrl': -ctrl_cost + } + + return observation, reward, done, info + + def _get_obs(self): + position = self.sim.data.qpos.flat.copy() + velocity = self.sim.data.qvel.flat.copy() + + if self._exclude_current_positions_from_observation: + position = position[1:] + + observation = np.concatenate((position, velocity)).ravel() + return observation + + def reset_model(self): + noise_low = -self._reset_noise_scale + noise_high = self._reset_noise_scale + + qpos = self.init_qpos + self.np_random.uniform( + low=noise_low, high=noise_high, size=self.model.nq) + qvel = self.init_qvel + self._reset_noise_scale * self.np_random.randn( + self.model.nv) + + self.set_state(qpos, qvel) + + observation = self._get_obs() + return observation + + def viewer_setup(self): + for key, value in DEFAULT_CAMERA_CONFIG.items(): + if isinstance(value, np.ndarray): + getattr(self.viewer.cam, key)[:] = value + else: + setattr(self.viewer.cam, key, value) diff --git a/gym/envs/mujoco/hopper_v3.py b/gym/envs/mujoco/hopper_v3.py new file mode 100644 index 00000000000..f049218778c --- /dev/null +++ b/gym/envs/mujoco/hopper_v3.py @@ -0,0 +1,137 @@ +import numpy as np +from gym.envs.mujoco import mujoco_env +from gym import utils + + +DEFAULT_CAMERA_CONFIG = { + 'trackbodyid': 2, + 'distance': 3.0, + 'lookat': np.array((0.0, 0.0, 1.15)), + 'elevation': -20.0, +} + + +class HopperEnv(mujoco_env.MujocoEnv, utils.EzPickle): + def __init__(self, + xml_file='hopper.xml', + forward_reward_weight=1.0, + ctrl_cost_weight=1e-3, + healthy_reward=1.0, + terminate_when_unhealthy=True, + healthy_state_range=(-100.0, 100.0), + healthy_z_range=(0.7, float('inf')), + healthy_angle_range=(-0.2, 0.2), + reset_noise_scale=5e-3, + exclude_current_positions_from_observation=True): + utils.EzPickle.__init__(**locals()) + + self._forward_reward_weight = forward_reward_weight + + self._ctrl_cost_weight = ctrl_cost_weight + + self._healthy_reward = healthy_reward + self._terminate_when_unhealthy = terminate_when_unhealthy + + self._healthy_state_range = healthy_state_range + self._healthy_z_range = healthy_z_range + self._healthy_angle_range = healthy_angle_range + + self._reset_noise_scale = reset_noise_scale + + self._exclude_current_positions_from_observation = ( + exclude_current_positions_from_observation) + + mujoco_env.MujocoEnv.__init__(self, xml_file, 4) + + @property + def healthy_reward(self): + return float( + self.is_healthy + or self._terminate_when_unhealthy + ) * self._healthy_reward + + def control_cost(self, action): + control_cost = self._ctrl_cost_weight * np.sum(np.square(action)) + return control_cost + + @property + def is_healthy(self): + z, angle = self.sim.data.qpos[1:3] + state = self.state_vector()[2:] + + min_state, max_state = self._healthy_state_range + min_z, max_z = self._healthy_z_range + min_angle, max_angle = self._healthy_angle_range + + healthy_state = np.all( + np.logical_and(min_state < state, state < max_state)) + healthy_z = min_z < z < max_z + healthy_angle = min_angle < angle < max_angle + + is_healthy = all((healthy_state, healthy_z, healthy_angle)) + + return is_healthy + + @property + def 
done(self): + done = (not self.is_healthy + if self._terminate_when_unhealthy + else False) + return done + + def _get_obs(self): + position = self.sim.data.qpos.flat.copy() + velocity = np.clip( + self.sim.data.qvel.flat.copy(), -10, 10) + + if self._exclude_current_positions_from_observation: + position = position[1:] + + observation = np.concatenate((position, velocity)).ravel() + return observation + + def step(self, action): + x_position_before = self.sim.data.qpos[0] + self.do_simulation(action, self.frame_skip) + x_position_after = self.sim.data.qpos[0] + x_velocity = ((x_position_after - x_position_before) + / self.dt) + + ctrl_cost = self.control_cost(action) + + forward_reward = self._forward_reward_weight * x_velocity + healthy_reward = self.healthy_reward + + rewards = forward_reward + healthy_reward + costs = ctrl_cost + + observation = self._get_obs() + reward = rewards - costs + done = self.done + info = { + 'x_position': x_position_after, + 'x_velocity': x_velocity, + } + + return observation, reward, done, info + + def reset_model(self): + noise_low = -self._reset_noise_scale + noise_high = self._reset_noise_scale + + qpos = self.init_qpos + self.np_random.uniform( + low=noise_low, high=noise_high, size=self.model.nq) + qvel = self.init_qvel + self.np_random.uniform( + low=noise_low, high=noise_high, size=self.model.nv) + + self.set_state(qpos, qvel) + + observation = self._get_obs() + return observation + + def viewer_setup(self): + for key, value in DEFAULT_CAMERA_CONFIG.items(): + if isinstance(value, np.ndarray): + getattr(self.viewer.cam, key)[:] = value + else: + setattr(self.viewer.cam, key, value) diff --git a/gym/envs/mujoco/humanoid_v3.py b/gym/envs/mujoco/humanoid_v3.py new file mode 100644 index 00000000000..fd62c3c420f --- /dev/null +++ b/gym/envs/mujoco/humanoid_v3.py @@ -0,0 +1,164 @@ +import numpy as np +from gym.envs.mujoco import mujoco_env +from gym import utils + + +DEFAULT_CAMERA_CONFIG = { + 'trackbodyid': 1, + 'distance': 4.0, + 'lookat': np.array((0.0, 0.0, 2.0)), + 'elevation': -20.0, +} + + +def mass_center(model, sim): + mass = np.expand_dims(model.body_mass, axis=1) + xpos = sim.data.xipos + return (np.sum(mass * xpos, axis=0) / np.sum(mass))[0:2].copy() + + +class HumanoidEnv(mujoco_env.MujocoEnv, utils.EzPickle): + def __init__(self, + xml_file='humanoid.xml', + forward_reward_weight=0.25, + ctrl_cost_weight=0.1, + contact_cost_weight=5e-7, + contact_cost_range=(-np.inf, 10.0), + healthy_reward=5.0, + terminate_when_unhealthy=True, + healthy_z_range=(1.0, 2.0), + reset_noise_scale=1e-2, + exclude_current_positions_from_observation=True): + utils.EzPickle.__init__(**locals()) + + self._forward_reward_weight = forward_reward_weight + self._ctrl_cost_weight = ctrl_cost_weight + self._contact_cost_weight = contact_cost_weight + self._contact_cost_range = contact_cost_range + self._healthy_reward = healthy_reward + self._terminate_when_unhealthy = terminate_when_unhealthy + self._healthy_z_range = healthy_z_range + + self._reset_noise_scale = reset_noise_scale + + self._exclude_current_positions_from_observation = ( + exclude_current_positions_from_observation) + + mujoco_env.MujocoEnv.__init__(self, xml_file, 5) + + @property + def healthy_reward(self): + return float( + self.is_healthy + or self._terminate_when_unhealthy + ) * self._healthy_reward + + def control_cost(self, action): + control_cost = self._ctrl_cost_weight * np.sum( + np.square(self.sim.data.ctrl)) + return control_cost + + @property + def contact_cost(self): + contact_forces = 
self.sim.data.cfrc_ext + contact_cost = self._contact_cost_weight * np.sum( + np.square(contact_forces)) + min_cost, max_cost = self._contact_cost_range + contact_cost = np.clip(contact_cost, min_cost, max_cost) + return contact_cost + + @property + def is_healthy(self): + min_z, max_z = self._healthy_z_range + is_healthy = min_z < self.sim.data.qpos[2] < max_z + + return is_healthy + + @property + def done(self): + done = ((not self.is_healthy) + if self._terminate_when_unhealthy + else False) + return done + + def _get_obs(self): + position = self.sim.data.qpos.flat.copy() + velocity = self.sim.data.qvel.flat.copy() + + com_inertia = self.sim.data.cinert.flat.copy() + com_velocity = self.sim.data.cvel.flat.copy() + + actuator_forces = self.sim.data.qfrc_actuator.flat.copy() + external_contact_forces = self.sim.data.cfrc_ext.flat.copy() + + if self._exclude_current_positions_from_observation: + position = position[2:] + + return np.concatenate(( + position, + velocity, + com_inertia, + com_velocity, + actuator_forces, + external_contact_forces, + )) + + def step(self, action): + xy_position_before = mass_center(self.model, self.sim) + self.do_simulation(action, self.frame_skip) + xy_position_after = mass_center(self.model, self.sim) + + xy_velocity = (xy_position_after - xy_position_before) / self.dt + x_velocity, y_velocity = xy_velocity + + x_velocity = ((xy_position_after[0] - xy_position_before[0]) + / self.model.opt.timestep) + + ctrl_cost = self.control_cost(action) + contact_cost = self.contact_cost + + forward_reward = self._forward_reward_weight * x_velocity + healthy_reward = self.healthy_reward + + rewards = forward_reward + healthy_reward + costs = ctrl_cost + contact_cost + + observation = self._get_obs() + reward = rewards - costs + done = self.done + info = { + 'reward_linvel': forward_reward, + 'reward_quadctrl': -ctrl_cost, + 'reward_alive': healthy_reward, + 'reward_impact': -contact_cost, + + 'x_position': xy_position_after[0], + 'y_position': xy_position_after[1], + 'distance_from_origin': np.linalg.norm(xy_position_after, ord=2), + + 'x_velocity': x_velocity, + 'y_velocity': y_velocity, + 'forward_reward': forward_reward, + } + + return observation, reward, done, info + + def reset_model(self): + noise_low = -self._reset_noise_scale + noise_high = self._reset_noise_scale + + qpos = self.init_qpos + self.np_random.uniform( + low=noise_low, high=noise_high, size=self.model.nq) + qvel = self.init_qvel + self.np_random.uniform( + low=noise_low, high=noise_high, size=self.model.nv) + self.set_state(qpos, qvel) + + observation = self._get_obs() + return observation + + def viewer_setup(self): + for key, value in DEFAULT_CAMERA_CONFIG.items(): + if isinstance(value, np.ndarray): + getattr(self.viewer.cam, key)[:] = value + else: + setattr(self.viewer.cam, key, value) diff --git a/gym/envs/mujoco/swimmer_v3.py b/gym/envs/mujoco/swimmer_v3.py new file mode 100644 index 00000000000..df00c41e25f --- /dev/null +++ b/gym/envs/mujoco/swimmer_v3.py @@ -0,0 +1,91 @@ +import numpy as np +from gym.envs.mujoco import mujoco_env +from gym import utils + + +DEFAULT_CAMERA_CONFIG = {} + + +class SwimmerEnv(mujoco_env.MujocoEnv, utils.EzPickle): + def __init__(self, + xml_file='swimmer.xml', + forward_reward_weight=1.0, + ctrl_cost_weight=1e-4, + reset_noise_scale=0.1, + exclude_current_positions_from_observation=True): + utils.EzPickle.__init__(**locals()) + + self._forward_reward_weight = forward_reward_weight + self._ctrl_cost_weight = ctrl_cost_weight + + self._reset_noise_scale = 
reset_noise_scale + + self._exclude_current_positions_from_observation = ( + exclude_current_positions_from_observation) + + mujoco_env.MujocoEnv.__init__(self, xml_file, 4) + + def control_cost(self, action): + control_cost = self._ctrl_cost_weight * np.sum(np.square(action)) + return control_cost + + def step(self, action): + xy_position_before = self.sim.data.qpos[0:2].copy() + self.do_simulation(action, self.frame_skip) + xy_position_after = self.sim.data.qpos[0:2].copy() + + xy_velocity = (xy_position_after - xy_position_before) / self.dt + x_velocity, y_velocity = xy_velocity + + forward_reward = self._forward_reward_weight * x_velocity + + ctrl_cost = self.control_cost(action) + + observation = self._get_obs() + reward = forward_reward - ctrl_cost + done = False + info = { + 'reward_fwd': forward_reward, + 'reward_ctrl': -ctrl_cost, + + 'x_position': xy_position_after[0], + 'y_position': xy_position_after[1], + 'distance_from_origin': np.linalg.norm(xy_position_after, ord=2), + + 'x_velocity': x_velocity, + 'y_velocity': y_velocity, + 'forward_reward': forward_reward, + } + + return observation, reward, done, info + + def _get_obs(self): + position = self.sim.data.qpos.flat.copy() + velocity = self.sim.data.qvel.flat.copy() + + if self._exclude_current_positions_from_observation: + position = position[2:] + + observation = np.concatenate([position, velocity]).ravel() + return observation + + def reset_model(self): + noise_low = -self._reset_noise_scale + noise_high = self._reset_noise_scale + + qpos = self.init_qpos + self.np_random.uniform( + low=noise_low, high=noise_high, size=self.model.nq) + qvel = self.init_qvel + self.np_random.uniform( + low=noise_low, high=noise_high, size=self.model.nv) + + self.set_state(qpos, qvel) + + observation = self._get_obs() + return observation + + def viewer_setup(self): + for key, value in DEFAULT_CAMERA_CONFIG.items(): + if isinstance(value, np.ndarray): + getattr(self.viewer.cam, key)[:] = value + else: + setattr(self.viewer.cam, key, value) diff --git a/gym/envs/mujoco/walker2d_v3.py b/gym/envs/mujoco/walker2d_v3.py new file mode 100644 index 00000000000..ae4682e04b3 --- /dev/null +++ b/gym/envs/mujoco/walker2d_v3.py @@ -0,0 +1,129 @@ +import numpy as np +from gym.envs.mujoco import mujoco_env +from gym import utils + + +DEFAULT_CAMERA_CONFIG = { + 'trackbodyid': 2, + 'distance': 4.0, + 'lookat': np.array((0.0, 0.0, 1.15)), + 'elevation': -20.0, +} + + +class Walker2dEnv(mujoco_env.MujocoEnv, utils.EzPickle): + def __init__(self, + xml_file='walker2d.xml', + forward_reward_weight=1.0, + ctrl_cost_weight=1e-3, + healthy_reward=1.0, + terminate_when_unhealthy=True, + healthy_z_range=(0.8, 2.0), + healthy_angle_range=(-1.0, 1.0), + reset_noise_scale=5e-3, + exclude_current_positions_from_observation=True): + utils.EzPickle.__init__(**locals()) + + self._forward_reward_weight = forward_reward_weight + self._ctrl_cost_weight = ctrl_cost_weight + + self._healthy_reward = healthy_reward + self._terminate_when_unhealthy = terminate_when_unhealthy + + self._healthy_z_range = healthy_z_range + self._healthy_angle_range = healthy_angle_range + + self._reset_noise_scale = reset_noise_scale + + self._exclude_current_positions_from_observation = ( + exclude_current_positions_from_observation) + + mujoco_env.MujocoEnv.__init__(self, xml_file, 4) + + @property + def healthy_reward(self): + return float( + self.is_healthy + or self._terminate_when_unhealthy + ) * self._healthy_reward + + def control_cost(self, action): + control_cost = self._ctrl_cost_weight 
* np.sum(np.square(action)) + return control_cost + + @property + def is_healthy(self): + z, angle = self.sim.data.qpos[1:3] + + min_z, max_z = self._healthy_z_range + min_angle, max_angle = self._healthy_angle_range + + healthy_z = min_z < z < max_z + healthy_angle = min_angle < angle < max_angle + is_healthy = healthy_z and healthy_angle + + return is_healthy + + @property + def done(self): + done = (not self.is_healthy + if self._terminate_when_unhealthy + else False) + return done + + def _get_obs(self): + position = self.sim.data.qpos.flat.copy() + velocity = np.clip( + self.sim.data.qvel.flat.copy(), -10, 10) + + if self._exclude_current_positions_from_observation: + position = position[1:] + + observation = np.concatenate((position, velocity)).ravel() + return observation + + def step(self, action): + x_position_before = self.sim.data.qpos[0] + self.do_simulation(action, self.frame_skip) + x_position_after = self.sim.data.qpos[0] + x_velocity = ((x_position_after - x_position_before) + / self.dt) + + ctrl_cost = self.control_cost(action) + + forward_reward = self._forward_reward_weight * x_velocity + healthy_reward = self.healthy_reward + + rewards = forward_reward + healthy_reward + costs = ctrl_cost + + observation = self._get_obs() + reward = rewards - costs + done = self.done + info = { + 'x_position': x_position_after, + 'x_velocity': x_velocity, + } + + return observation, reward, done, info + + def reset_model(self): + noise_low = -self._reset_noise_scale + noise_high = self._reset_noise_scale + + qpos = self.init_qpos + self.np_random.uniform( + low=noise_low, high=noise_high, size=self.model.nq) + qvel = self.init_qvel + self.np_random.uniform( + low=noise_low, high=noise_high, size=self.model.nv) + + self.set_state(qpos, qvel) + + observation = self._get_obs() + return observation + + def viewer_setup(self): + for key, value in DEFAULT_CAMERA_CONFIG.items(): + if isinstance(value, np.ndarray): + getattr(self.viewer.cam, key)[:] = value + else: + setattr(self.viewer.cam, key, value) diff --git a/gym/envs/tests/spec_list.py b/gym/envs/tests/spec_list.py index 8eb872afa55..2d64deafb44 100644 --- a/gym/envs/tests/spec_list.py +++ b/gym/envs/tests/spec_list.py @@ -1,18 +1,19 @@ from gym import envs, logger import os +skip_mujoco = not (os.environ.get('MUJOCO_KEY')) +if not skip_mujoco: + try: + import mujoco_py + except ImportError: + skip_mujoco = True + def should_skip_env_spec_for_tests(spec): # We skip tests for envs that require dependencies or are otherwise # troublesome to run frequently ep = spec._entry_point # Skip mujoco tests for pull request CI - skip_mujoco = not (os.environ.get('MUJOCO_KEY')) - if not skip_mujoco: - try: - import mujoco_py - except ImportError: - skip_mujoco = True - if skip_mujoco and (ep.startswith('gym.envs.mujoco:') or ep.startswith('gym.envs.robotics:')): + if skip_mujoco and (ep.startswith('gym.envs.mujoco') or ep.startswith('gym.envs.robotics:')): return True try: import atari_py diff --git a/gym/envs/tests/test_mujoco_v2_to_v3_conversion.py b/gym/envs/tests/test_mujoco_v2_to_v3_conversion.py new file mode 100644 index 00000000000..7fbf3637d9c --- /dev/null +++ b/gym/envs/tests/test_mujoco_v2_to_v3_conversion.py @@ -0,0 +1,86 @@ +import unittest +import numpy as np +from gym import envs +from gym.envs.tests.spec_list import skip_mujoco + + +def verify_environments_match(old_environment_id, + new_environment_id, + seed=1, + num_actions=1000): + old_environment = envs.make(old_environment_id) + new_environment = envs.make(new_environment_id) 
+
+    old_environment.seed(seed)
+    new_environment.seed(seed)
+
+    old_reset_observation = old_environment.reset()
+    new_reset_observation = new_environment.reset()
+
+    np.testing.assert_allclose(old_reset_observation, new_reset_observation)
+
+    for i in range(num_actions):
+        action = old_environment.action_space.sample()
+        old_observation, old_reward, old_done, old_info = old_environment.step(
+            action)
+        new_observation, new_reward, new_done, new_info = new_environment.step(
+            action)
+
+        np.testing.assert_allclose(old_observation, new_observation)
+        np.testing.assert_allclose(old_reward, new_reward)
+        np.testing.assert_allclose(old_done, new_done)
+
+        for key in old_info:
+            np.testing.assert_array_equal(old_info[key], new_info[key])
+
+
+@unittest.skipIf(skip_mujoco, 'Cannot run mujoco tests '
+                              '(either license key not found or '
+                              'mujoco not installed properly)')
+class MujocoV2ToV3ConversionTest(unittest.TestCase):
+    def test_environments_match(self):
+        test_cases = (
+            {
+                'old_id': 'Swimmer-v2',
+                'new_id': 'Swimmer-v3'
+            },
+            {
+                'old_id': 'Hopper-v2',
+                'new_id': 'Hopper-v3'
+            },
+            {
+                'old_id': 'Walker2d-v2',
+                'new_id': 'Walker2d-v3'
+            },
+            {
+                'old_id': 'HalfCheetah-v2',
+                'new_id': 'HalfCheetah-v3'
+            },
+            {
+                'old_id': 'Ant-v2',
+                'new_id': 'Ant-v3'
+            },
+            {
+                'old_id': 'Humanoid-v2',
+                'new_id': 'Humanoid-v3'
+            },
+        )
+
+        for test_case in test_cases:
+            verify_environments_match(test_case['old_id'], test_case['new_id'])
+
+        # Raises KeyError because the new envs have extra info
+        with self.assertRaises(KeyError):
+            verify_environments_match('Swimmer-v3', 'Swimmer-v2')
+
+        # Raises KeyError because the new envs have extra info
+        with self.assertRaises(KeyError):
+            verify_environments_match('Humanoid-v3', 'Humanoid-v2')
+
+
+if __name__ == '__main__':
+    unittest.main()
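
A minimal usage sketch for the new configurable v3 environments (not part of the diff; it assumes mujoco-py and a valid MuJoCo license are installed). The registered ids are created through gym.make as before, and the new reward/observation knobs can be overridden by constructing the environment class directly:

    import gym
    from gym.envs.mujoco.half_cheetah_v3 import HalfCheetahEnv

    # Registered id with default settings (matches HalfCheetah-v2 behaviour).
    env = gym.make('HalfCheetah-v3')
    observation = env.reset()
    observation, reward, done, info = env.step(env.action_space.sample())
    print(info['x_velocity'], info['reward_ctrl'])

    # Direct construction to override the new constructor arguments,
    # e.g. a heavier control penalty and keeping the x position in the observation.
    custom_env = HalfCheetahEnv(
        ctrl_cost_weight=0.5,
        exclude_current_positions_from_observation=False)

The same pattern applies to the other v3 classes in this diff (AntEnv, HopperEnv, HumanoidEnv, SwimmerEnv, Walker2dEnv), each of which exposes its reward weights, healthy ranges, and reset noise scale as constructor arguments.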