Add fixes to NaoStanding Env (#1625)
* Refactor tomato throwing mechanics and reward system in NaoStanding environment

* add newline
oleflb authored Feb 1, 2025
1 parent d95050a commit 9968ef4
Showing 5 changed files with 46 additions and 17 deletions.
4 changes: 2 additions & 2 deletions tools/machine-learning/mujoco/model/tomato.xml
@@ -1,7 +1,7 @@
 <mujoco model="tomato">
     <worldbody>
-        <body name="tomato" pos="2 0 0">
-            <geom type="sphere" size="0.05" mass="0.25" rgba="1 0 0 1" />
+        <body name="tomato" pos="0 2 0">
+            <geom type="sphere" size="0.05" mass="0.5" rgba="1 0 0 1" />
             <freejoint />
         </body>
     </worldbody>
@@ -6,6 +6,7 @@
 from numpy.typing import NDArray
 from rewards import (
     ConstantReward,
+    ControlAmplitudePenalty,
     HeadOverTorsoPenalty,
     RewardComposer,
     RewardContext,
@@ -40,7 +41,7 @@
     ],
 )
 
-HEAD_SET_HEIGHT = 0.51
+HEAD_SET_HEIGHT = 0.493
 
 
 class NaoStanding(NaoBaseEnv, utils.EzPickle):
@@ -57,44 +58,53 @@ def __init__(
         )
 
         self.current_step = 0
-        self.termination_penalty = 10.0
+        self.next_throw_at = 500
+        self.expected_number_of_frames_between_throws = 120
+        self.rng = np.random.default_rng()
 
         self.reward = (
             RewardComposer()
-            .add(0.05, ConstantReward())
-            .add(-0.01, TorqueChangeRatePenalty(self.model.nu, self.dt))
-            .add(1.0, HeadOverTorsoPenalty())
+            .add(0.02, ConstantReward())
+            .add(-0.001, TorqueChangeRatePenalty(self.model.nu, self.dt))
+            .add(-0.001, ControlAmplitudePenalty())
+            .add(-0.5, HeadOverTorsoPenalty())
         )
         utils.EzPickle.__init__(self, **kwargs)
 
+    def _should_throw_tomato(self) -> bool:
+        allowed_to_throw = (
+            self.current_step >= self.next_throw_at
+            and self.projectile.has_ground_contact()
+        )
+        if allowed_to_throw:
+            self.next_throw_at = self.current_step + self.rng.poisson(
+                self.expected_number_of_frames_between_throws
+            )
+
+        return allowed_to_throw
+
     @override
     def step(self, action: NDArray[np.floating]) -> tuple:
+        self.current_step += 1
 
-        if self.throw_tomatoes and self.projectile.has_ground_contact():
+        if self.throw_tomatoes and self._should_throw_tomato():
             target = self.data.site("Robot").xpos
-            alpha = self.current_step / 2500
-            time_to_reach = 0.2 * (1 - alpha) + 0.1 * alpha
+            time_to_reach = 0.15
             self.projectile.random_throw(
                 target,
                 time_to_reach=time_to_reach,
                 distance=1.0,
             )
 
         self.do_simulation(action + OFFSET_QPOS, self.frame_skip)
         head_center_z = self.data.site("head_center").xpos[2]
 
         if self.render_mode == "human":
             self.render()
 
-        terminated = head_center_z < 0.3
-
         distinct_rewards = self.reward.rewards(RewardContext(self.nao, action))
         reward = sum(distinct_rewards.values())
 
-        if terminated:
-            reward -= self.termination_penalty
-
-        self.current_step += 1
+        terminated = False
         return (
             self._get_obs(),
@@ -107,6 +117,8 @@ def step(self, action: NDArray[np.floating]) -> tuple:
     @override
     def reset_model(self) -> NDArray[np.floating]:
         self.current_step = 0
+        self.next_throw_at = 500
+        self.reward.reset()
         self.set_state(
             self.init_qpos,
             self.init_qvel,
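The new _should_throw_tomato helper replaces the old throw-on-every-ground-contact behaviour with a Poisson-spaced schedule: after an initial grace period of 500 steps, each throw draws the gap to the next one from a Poisson distribution with a mean of 120 frames, and a throw still requires the previous tomato to have ground contact. A standalone sketch of that schedule (illustration only, assuming just NumPy and ignoring the ground-contact condition):

    import numpy as np

    # Illustrative re-creation of the scheduling in _should_throw_tomato.
    rng = np.random.default_rng(seed=0)
    next_throw_at = 500
    expected_frames_between_throws = 120

    throw_steps = []
    for step in range(5000):
        if step >= next_throw_at:
            throw_steps.append(step)
            next_throw_at = step + rng.poisson(expected_frames_between_throws)

    gaps = np.diff(throw_steps)
    print(f"{len(throw_steps)} throws, mean gap {gaps.mean():.1f} frames")

Over a long rollout the mean gap converges to roughly 120 frames, so the robot is hit at irregular but statistically predictable intervals instead of the old schedule that tied throw timing to the episode step via alpha.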
@@ -16,7 +16,7 @@ def add(self, factor: float | None, reward: BaseReward) -> Self:
         return self
 
     def reward(self, context: RewardContext) -> np.floating:
-        return np.float32(sum(self._inner_rewards(context).values()))
+        return np.float32(sum(self.rewards(context).values()))
 
     def rewards(self, context: RewardContext) -> dict[str, np.floating]:
         return {
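This one-line change points reward() at the same rewards() dictionary that NaoStanding.step sums, so the scalar reward and the per-term breakdown cannot drift apart. A hypothetical, stripped-down composer (not the project's RewardComposer; names and factors are illustrative) showing that invariant:

    import numpy as np

    class MiniComposer:
        def __init__(self) -> None:
            self._terms = []  # (name, factor, reward_fn) tuples

        def add(self, name, factor, reward_fn):
            self._terms.append((name, factor, reward_fn))
            return self

        def rewards(self, context) -> dict:
            return {
                name: np.float32(factor * fn(context))
                for name, factor, fn in self._terms
            }

        def reward(self, context) -> np.floating:
            # Delegating to rewards() keeps the scalar in sync with the per-term values.
            return np.float32(sum(self.rewards(context).values()))

    composer = (
        MiniComposer()
        .add("constant", 0.02, lambda ctx: 1.0)
        .add("head_over_torso", -0.5, lambda ctx: 0.3)
    )
    print(composer.rewards(None))  # per-term values: 0.02 and -0.15
    print(composer.reward(None))   # their sum, about -0.13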
@@ -64,6 +64,7 @@ def reward(self, context: RewardContext) -> np.floating:
 class TorqueChangeRatePenalty(BaseReward):
     def __init__(self, actuator_dimension: int, dt: float) -> None:
         self.previous_force = np.zeros(actuator_dimension)
+        self.is_initialized = False
         self.dt = dt
 
     def reward(self, context: RewardContext) -> np.floating:
@@ -78,8 +79,16 @@ def reward(self, context: RewardContext) -> np.floating:
             np.abs(previous_torque - current_torque) / self.dt
         )
         self.previous_force = np.copy(context.nao.data.actuator_force)
+
+        if not self.is_initialized:
+            self.is_initialized = True
+            return np.float32(0.0)
+
         return torque_change_rate
+
+    def reset(self) -> None:
+        self.is_initialized = False
 
 
 class XDistanceReward(BaseReward):
     def reward(self, context: RewardContext) -> np.floating:
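The is_initialized flag exists because previous_force starts as a zero vector: without the guard, the first reward after construction or reset() would compare the current actuator output against zeros and report a large, spurious torque change. A simplified standalone sketch of the pattern (illustration only; it uses raw actuator forces where the real penalty works on torques):

    import numpy as np

    class TorqueChangeRateSketch:
        def __init__(self, actuator_dimension: int, dt: float) -> None:
            self.previous_force = np.zeros(actuator_dimension)
            self.is_initialized = False
            self.dt = dt

        def penalty(self, actuator_force: np.ndarray) -> np.floating:
            change_rate = np.sum(np.abs(self.previous_force - actuator_force) / self.dt)
            self.previous_force = np.copy(actuator_force)
            if not self.is_initialized:
                # First call after a reset: there is no meaningful previous value yet.
                self.is_initialized = True
                return np.float32(0.0)
            return np.float32(change_rate)

        def reset(self) -> None:
            self.is_initialized = False

    sketch = TorqueChangeRateSketch(actuator_dimension=2, dt=0.02)
    print(sketch.penalty(np.array([1.0, -1.0])))  # 0.0 instead of a spurious 100.0
    print(sketch.penalty(np.array([1.2, -0.8])))  # 20.0: (0.2 + 0.2) / 0.02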
@@ -66,6 +66,14 @@ def __init__(
         )
 
     def has_ground_contact(self) -> bool:
+        qpos_index = self.model.jnt_qposadr[
+            self.model.body_jntadr[self.throwable_index]
+        ]
+        z_height = self.data.qpos[qpos_index + 2]
+        if z_height <= -1.0:
+            # clipped through the floor
+            return True
+
         geoms = (
             self.model.body_geomadr[self.throwable_index],
             self.model.body_geomadr[self.ground_index],
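The early return reads the tomato's height straight from qpos: for a free-jointed body, model.jnt_qposadr[model.body_jntadr[body_id]] is the index of its x translation, so index + 2 is the z height, and a value below -1.0 means the geom tunnelled through the floor without ever registering a contact. A standalone sketch of that lookup with the mujoco Python bindings (a toy model loosely based on tomato.xml, with no floor so the sphere just keeps falling):

    import mujoco

    XML = """
    <mujoco>
      <worldbody>
        <body name="tomato" pos="0 2 1">
          <geom type="sphere" size="0.05" mass="0.5" rgba="1 0 0 1" />
          <freejoint />
        </body>
      </worldbody>
    </mujoco>
    """

    model = mujoco.MjModel.from_xml_string(XML)
    data = mujoco.MjData(model)

    body_id = model.body("tomato").id
    # A free joint stores x, y, z and then a quaternion in qpos.
    qpos_index = model.jnt_qposadr[model.body_jntadr[body_id]]

    for _ in range(2000):
        mujoco.mj_step(model, data)

    z_height = data.qpos[qpos_index + 2]
    print(f"z after falling for a few seconds: {z_height:.1f}")  # far below -1.0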
