Add fixes to NaoStanding Env (#1625)
* Refactor tomato throwing mechanics and reward system in NaoStanding environment

* add newline
oleflb authored Feb 1, 2025
1 parent d95050a commit 9968ef4
Showing 5 changed files with 46 additions and 17 deletions.
4 changes: 2 additions & 2 deletions tools/machine-learning/mujoco/model/tomato.xml
@@ -1,7 +1,7 @@
 <mujoco model="tomato">
     <worldbody>
-        <body name="tomato" pos="2 0 0">
-            <geom type="sphere" size="0.05" mass="0.25" rgba="1 0 0 1" />
+        <body name="tomato" pos="0 2 0">
+            <geom type="sphere" size="0.05" mass="0.5" rgba="1 0 0 1" />
             <freejoint />
         </body>
     </worldbody>
@@ -6,6 +6,7 @@
 from numpy.typing import NDArray
 from rewards import (
     ConstantReward,
+    ControlAmplitudePenalty,
     HeadOverTorsoPenalty,
     RewardComposer,
     RewardContext,
@@ -40,7 +41,7 @@
     ],
 )
 
-HEAD_SET_HEIGHT = 0.51
+HEAD_SET_HEIGHT = 0.493
 
 
 class NaoStanding(NaoBaseEnv, utils.EzPickle):
@@ -57,44 +58,53 @@ def __init__(
         )
 
         self.current_step = 0
-        self.termination_penalty = 10.0
+        self.next_throw_at = 500
+        self.expected_number_of_frames_between_throws = 120
+        self.rng = np.random.default_rng()
 
         self.reward = (
             RewardComposer()
-            .add(0.05, ConstantReward())
-            .add(-0.01, TorqueChangeRatePenalty(self.model.nu, self.dt))
-            .add(1.0, HeadOverTorsoPenalty())
+            .add(0.02, ConstantReward())
+            .add(-0.001, TorqueChangeRatePenalty(self.model.nu, self.dt))
+            .add(-0.001, ControlAmplitudePenalty())
+            .add(-0.5, HeadOverTorsoPenalty())
         )
         utils.EzPickle.__init__(self, **kwargs)
 
+    def _should_throw_tomato(self) -> bool:
+        allowed_to_throw = (
+            self.current_step >= self.next_throw_at
+            and self.projectile.has_ground_contact()
+        )
+        if allowed_to_throw:
+            self.next_throw_at = self.current_step + self.rng.poisson(
+                self.expected_number_of_frames_between_throws
+            )
+
+        return allowed_to_throw
+
     @override
     def step(self, action: NDArray[np.floating]) -> tuple:
+        self.current_step += 1
 
-        if self.throw_tomatoes and self.projectile.has_ground_contact():
+        if self.throw_tomatoes and self._should_throw_tomato():
             target = self.data.site("Robot").xpos
-            alpha = self.current_step / 2500
-            time_to_reach = 0.2 * (1 - alpha) + 0.1 * alpha
+            time_to_reach = 0.15
             self.projectile.random_throw(
                 target,
                 time_to_reach=time_to_reach,
                 distance=1.0,
             )
 
         self.do_simulation(action + OFFSET_QPOS, self.frame_skip)
         head_center_z = self.data.site("head_center").xpos[2]
 
         if self.render_mode == "human":
             self.render()
 
-        terminated = head_center_z < 0.3
-
         distinct_rewards = self.reward.rewards(RewardContext(self.nao, action))
         reward = sum(distinct_rewards.values())
 
-        if terminated:
-            reward -= self.termination_penalty
-
-        self.current_step += 1
+        terminated = False
         return (
             self._get_obs(),
@@ -107,6 +117,8 @@ def step(self, action: NDArray[np.floating]) -> tuple:
     @override
     def reset_model(self) -> NDArray[np.floating]:
         self.current_step = 0
+        self.next_throw_at = 500
+        self.reward.reset()
         self.set_state(
             self.init_qpos,
             self.init_qvel,
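The new _should_throw_tomato helper replaces the old throw-on-every-ground-contact behaviour with a Poisson-spaced schedule: after an initial grace period of 500 steps, each throw draws the gap to the next one from a Poisson distribution with a mean of 120 frames, and a throw still requires the previous tomato to have ground contact. A standalone sketch of that schedule (illustration only, assuming just NumPy and ignoring the ground-contact condition):

    import numpy as np

    # Illustrative re-creation of the scheduling in _should_throw_tomato.
    rng = np.random.default_rng(seed=0)
    next_throw_at = 500
    expected_frames_between_throws = 120

    throw_steps = []
    for step in range(5000):
        if step >= next_throw_at:
            throw_steps.append(step)
            next_throw_at = step + rng.poisson(expected_frames_between_throws)

    gaps = np.diff(throw_steps)
    print(f"{len(throw_steps)} throws, mean gap {gaps.mean():.1f} frames")

Over a long rollout the mean gap converges to roughly 120 frames, so the robot is hit at irregular but statistically predictable intervals instead of the old schedule that tied throw timing to the episode step via alpha.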
@@ -16,7 +16,7 @@ def add(self, factor: float | None, reward: BaseReward) -> Self:
         return self
 
     def reward(self, context: RewardContext) -> np.floating:
-        return np.float32(sum(self._inner_rewards(context).values()))
+        return np.float32(sum(self.rewards(context).values()))
 
     def rewards(self, context: RewardContext) -> dict[str, np.floating]:
         return {
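This one-line change points reward() at the same rewards() dictionary that NaoStanding.step sums, so the scalar reward and the per-term breakdown cannot drift apart. A hypothetical, stripped-down composer (not the project's RewardComposer; names and factors are illustrative) showing that invariant:

    import numpy as np

    class MiniComposer:
        def __init__(self) -> None:
            self._terms = []  # (name, factor, reward_fn) tuples

        def add(self, name, factor, reward_fn):
            self._terms.append((name, factor, reward_fn))
            return self

        def rewards(self, context) -> dict:
            return {
                name: np.float32(factor * fn(context))
                for name, factor, fn in self._terms
            }

        def reward(self, context) -> np.floating:
            # Delegating to rewards() keeps the scalar in sync with the per-term values.
            return np.float32(sum(self.rewards(context).values()))

    composer = (
        MiniComposer()
        .add("constant", 0.02, lambda ctx: 1.0)
        .add("head_over_torso", -0.5, lambda ctx: 0.3)
    )
    print(composer.rewards(None))  # per-term values: 0.02 and -0.15
    print(composer.reward(None))   # their sum, about -0.13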
@@ -64,6 +64,7 @@ def reward(self, context: RewardContext) -> np.floating:
 class TorqueChangeRatePenalty(BaseReward):
     def __init__(self, actuator_dimension: int, dt: float) -> None:
         self.previous_force = np.zeros(actuator_dimension)
+        self.is_initialized = False
         self.dt = dt
 
     def reward(self, context: RewardContext) -> np.floating:
@@ -78,8 +79,16 @@ def reward(self, context: RewardContext) -> np.floating:
             np.abs(previous_torque - current_torque) / self.dt
         )
         self.previous_force = np.copy(context.nao.data.actuator_force)
+
+        if not self.is_initialized:
+            self.is_initialized = True
+            return np.float32(0.0)
+
         return torque_change_rate
+
+    def reset(self) -> None:
+        self.is_initialized = False
 
 
 class XDistanceReward(BaseReward):
     def reward(self, context: RewardContext) -> np.floating:
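The is_initialized flag exists because previous_force starts as a zero vector: without the guard, the first reward after construction or reset() would compare the current actuator output against zeros and report a large, spurious torque change. A simplified standalone sketch of the pattern (illustration only; it uses raw actuator forces where the real penalty works on torques):

    import numpy as np

    class TorqueChangeRateSketch:
        def __init__(self, actuator_dimension: int, dt: float) -> None:
            self.previous_force = np.zeros(actuator_dimension)
            self.is_initialized = False
            self.dt = dt

        def penalty(self, actuator_force: np.ndarray) -> np.floating:
            change_rate = np.sum(np.abs(self.previous_force - actuator_force) / self.dt)
            self.previous_force = np.copy(actuator_force)
            if not self.is_initialized:
                # First call after a reset: there is no meaningful previous value yet.
                self.is_initialized = True
                return np.float32(0.0)
            return np.float32(change_rate)

        def reset(self) -> None:
            self.is_initialized = False

    sketch = TorqueChangeRateSketch(actuator_dimension=2, dt=0.02)
    print(sketch.penalty(np.array([1.0, -1.0])))  # 0.0 instead of a spurious 100.0
    print(sketch.penalty(np.array([1.2, -0.8])))  # 20.0: (0.2 + 0.2) / 0.02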
@@ -66,6 +66,14 @@ def __init__(
         )
 
     def has_ground_contact(self) -> bool:
+        qpos_index = self.model.jnt_qposadr[
+            self.model.body_jntadr[self.throwable_index]
+        ]
+        z_height = self.data.qpos[qpos_index + 2]
+        if z_height <= -1.0:
+            # clipped through the floor
+            return True
+
         geoms = (
             self.model.body_geomadr[self.throwable_index],
             self.model.body_geomadr[self.ground_index],
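The early return reads the tomato's height straight from qpos: for a free-jointed body, model.jnt_qposadr[model.body_jntadr[body_id]] is the index of its x translation, so index + 2 is the z height, and a value below -1.0 means the geom tunnelled through the floor without ever registering a contact. A standalone sketch of that lookup with the mujoco Python bindings (a toy model loosely based on tomato.xml, with no floor so the sphere just keeps falling):

    import mujoco

    XML = """
    <mujoco>
      <worldbody>
        <body name="tomato" pos="0 2 1">
          <geom type="sphere" size="0.05" mass="0.5" rgba="1 0 0 1" />
          <freejoint />
        </body>
      </worldbody>
    </mujoco>
    """

    model = mujoco.MjModel.from_xml_string(XML)
    data = mujoco.MjData(model)

    body_id = model.body("tomato").id
    # A free joint stores x, y, z and then a quaternion in qpos.
    qpos_index = model.jnt_qposadr[model.body_jntadr[body_id]]

    for _ in range(2000):
        mujoco.mj_step(model, data)

    z_height = data.qpos[qpos_index + 2]
    print(f"z after falling for a few seconds: {z_height:.1f}")  # far below -1.0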
