diff --git a/spinup/algos/pytorch/sac/sac.py b/spinup/algos/pytorch/sac/sac.py
index 777c1f38f..8094deead 100644
--- a/spinup/algos/pytorch/sac/sac.py
+++ b/spinup/algos/pytorch/sac/sac.py
@@ -157,7 +157,7 @@ def sac(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0,
     obs_dim = env.observation_space.shape
     act_dim = env.action_space.shape[0]
 
-    # Action limit for clamping: critically, assumes all dimensions share the same bound!
+    # Action limit for clamping: critically, assumes all dimensions share the same bound! (Attention: adjust this if the bounds differ across action dimensions.)
     act_limit = env.action_space.high[0]
 
     # Create actor-critic module and target networks
diff --git a/spinup/backup_gym/gym/envs/classic_control/tworr.py b/spinup/backup_gym/gym/envs/classic_control/tworr.py
index 2e006a888..63dfd7139 100644
--- a/spinup/backup_gym/gym/envs/classic_control/tworr.py
+++ b/spinup/backup_gym/gym/envs/classic_control/tworr.py
@@ -25,7 +25,10 @@ def __init__(self):
         # TODO: reward params
         self.lp=100
         self.lv=100
-        self.reward_eta=0.75
+        self.ljerk=100
+        self.reward_eta_p=0.7
+        self.reward_eta_v=0.15
+        self.reward_eta_jerk=0.15
         # TODO: User defined linear position gain
         self.K_p = 10
         self.K_i = 5
@@ -43,7 +46,9 @@ def __init__(self):
         self.state = None
         self.state_buffer= None
         self.t = 0
-        self.seed()
+        seed=1
+        np.random.seed(seed)
+        self.seed(seed=seed)
 
         self.xd_init = 1.5
         self.yd_init = 0.4
@@ -56,11 +61,11 @@ def __init__(self):
         self.xd = np.linspace(self.xd_init, self.xd_init + deltax, self.MAX_TIMESTEPS, endpoint=True)
         self.yd = np.linspace(self.yd_init, self.yd_init + deltay, self.MAX_TIMESTEPS, endpoint=True)
 
-        # TODO CHECK
+        # TODO Attention: only the dimensionality of the observation space is enforced; the bounds below are not used. Modify the code if you need to enforce them.
         high_s = np.array([0.2, 0.2, 1.5, 1.5, 2, 2, 18, 5, 2, 2])
         low_s = np.array([-0.2, -0.2, -1.5, -1.5, -2, -2, -18, -5, -2, -2])
         self.observation_space = spaces.Box(low=low_s, high=high_s, dtype=np.float32)
-        high_a = np.array([1, 1])
+        high_a = np.array([1, 1])  # TODO Attention: limits must be identical across action dimensions; otherwise modify the sac code
         low_a = np.array([-1, -1])
         self.action_space = spaces.Box(low=low_a, high=high_a, dtype=np.float32)
 
@@ -223,7 +228,6 @@ def seed(self, seed=None):
     def reset(self):
         # # randomize true model parameter in every episode
         # self.LINK_MASS_2_TRUE = 1.1 + np.random.normal(loc=0.0, scale=0.01, size=1)
-
         # at time t=0
         self.t = 0
         rd_t = np.array([self.xd[self.t], self.yd[self.t]])
@@ -247,12 +251,12 @@ def reset(self):
         self.qc = qc_t.reshape(1, 2)
         self.dqc = dqc_t.reshape(1, 2)
         self.ddqc = ddqc_t.reshape(1, 2)
-        tau1_hat, tau2_hat = self.two_link_inverse_dynamics(q_t, dqc_t, ddqc_t)
+        tau1_hat, tau2_hat = self.two_link_inverse_dynamics(qc_t, dqc_t, ddqc_t)
 
         # s_init=[th1_init,dth1_init,th2_init,dth2_init]
         s_init = np.array([q_t[0], dq_t[0], q_t[1], dq_t[1]])
-        tau1 = tau1_hat #+ a[0]
-        tau2 = tau2_hat #+ a[1]
+        tau1 = tau1_hat  # TODO: check concept here
+        tau2 = tau2_hat
 
         q_FD = self.two_link_forward_dynamics(tau1, tau2, s_init)
         self.q = np.vstack((self.q, np.array([q_FD[0], q_FD[2]])))
@@ -299,7 +303,7 @@ def step(self,a):
         self.dqc = np.vstack((self.dqc, dqc_t))
         self.ddqc = np.vstack((self.ddqc, ddqc_t))
 
-        tau1_hat, tau2_hat = self.two_link_inverse_dynamics(q_t, dqc_t, ddqc_t)
+        tau1_hat, tau2_hat = self.two_link_inverse_dynamics(qc_t, dqc_t, ddqc_t)
 
         # s_init=[th1_init,dth1_init,th2_init,dth2_init]
         s_init = np.array([q_t[0], dq_t[0], q_t[1], dq_t[1]])
@@ -349,9 +353,11 @@ def step(self,a):
         error_p_t = sum(abs(obs[0:2]))
         v_hat_after = np.array(self.two_link_forward_kinematics(np.array([q_FD[0], q_FD[2]]))) - np.array(r_hat_t) / self.dt
         error_v_t = sum(abs(v_hat_after-vd_t))
+        jerk_level_t = np.abs(self.state_buffer[-1,6]-self.state_buffer[-2,6])+np.abs(self.state_buffer[-1,7]-self.state_buffer[-2,7])
+        reward_jerk_t = self.f_logistic(jerk_level_t,self.ljerk)
         reward_p_t=self.f_logistic(error_p_t,self.lp)
         reward_v_t = self.f_logistic(error_v_t, self.lv)
-        reward_t=self.reward_eta*reward_p_t+(1-self.reward_eta)*reward_v_t
+        reward_t=self.reward_eta_p*reward_p_t+self.reward_eta_v*reward_v_t+self.reward_eta_jerk*reward_jerk_t
 
         # given action it returns 4-tuple (observation, reward, done, info)
         return (self._get_ob(), reward_t, terminal, {})
diff --git a/spinup/examples/pytorch/inverted_pendulum_SAC.py b/spinup/examples/pytorch/inverted_pendulum_SAC.py
index 693b5079e..4476ab285 100644
--- a/spinup/examples/pytorch/inverted_pendulum_SAC.py
+++ b/spinup/examples/pytorch/inverted_pendulum_SAC.py
@@ -8,9 +8,9 @@
 from spinup.utils.test_policy import load_policy_and_env, run_policy
 import os
 
-TRAIN=False
+TRAIN=True
 env_fn = lambda: gym.make('Tworr-v0')
-exp_name = "Tworrv0_25" #"no_SAC_0_9"
+exp_name = "Tworrv0_27" #"no_SAC_0_9"
 if __name__ == '__main__':
     if TRAIN:
         # train
@@ -26,5 +26,6 @@
         # from gym.wrappers.monitoring.video_recorder import VideoRecorder
         # VideoRecorder(env_fn,'/home/mahdi/ETHZ/codes/spinningup/spinup/examples/pytorch/logs/video.mp4', enabled=True)
         # visualize output
-        env, get_action = load_policy_and_env('/home/mahdi/ETHZ/codes/spinningup/spinup/examples/pytorch/logs/'+exp_name)
-        run_policy(env, get_action,num_episodes=1)
+        output_dir='/home/mahdi/ETHZ/codes/spinningup/spinup/examples/pytorch/logs/'+exp_name
+        env, get_action = load_policy_and_env(output_dir)
+        run_policy(env, get_action,num_episodes=1, output_dir=output_dir)
diff --git a/spinup/utils/test_policy.py b/spinup/utils/test_policy.py
index 30d2c2d99..2f1d35d57 100644
--- a/spinup/utils/test_policy.py
+++ b/spinup/utils/test_policy.py
@@ -108,7 +108,7 @@ def get_action(x):
     return get_action
 
 
-def run_policy(env, get_action, max_ep_len=None, num_episodes=100, render=True):
+def run_policy(env, get_action, max_ep_len=None, num_episodes=100, render=True, output_dir=""):
 
     assert env is not None, \
         "Environment not found!\n\n It looks like the environment wasn't saved, " + \
@@ -144,7 +144,7 @@ def run_policy(env, get_action, max_ep_len=None, num_episodes=100, render=True):
     plt.xlabel("x")
     plt.ylabel("y")
     plt.legend()
-    plt.savefig("/home/mahdi/ETHZ/codes/spinningup/spinup/examples/pytorch/logs/Tworrv0_25/position.pdf",format="pdf", bbox_inches='tight')
+    plt.savefig(output_dir+"/position.pdf",format="pdf", bbox_inches='tight')
     plt.show()
 
     plt.figure(2)
@@ -158,7 +158,7 @@ def run_policy(env, get_action, max_ep_len=None, num_episodes=100, render=True):
     plt.xlabel("vx")
     plt.ylabel("vy")
     plt.legend()
-    plt.savefig("/home/mahdi/ETHZ/codes/spinningup/spinup/examples/pytorch/logs/Tworrv0_25/velocity.pdf",format="pdf", bbox_inches='tight')
+    plt.savefig(output_dir+"/velocity.pdf",format="pdf", bbox_inches='tight')
     plt.show()
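
Note (not part of the patch above): two of the TODOs point at the same constraint. Spinning Up's SAC reads a single scalar act_limit from env.action_space.high[0], so every action dimension must share the same symmetric bound, and tworr.py keeps high_a = [1, 1] to satisfy that. The sketch below is one hedged way to support differing per-dimension bounds instead; scale_action is a hypothetical helper, not an existing Spinning Up function, and it relies only on numpy/torch broadcasting.

    # Hedged sketch: rescale a tanh-squashed action to per-dimension Box bounds.
    # scale_action is hypothetical (not in Spinning Up); it replaces the scalar
    # act_limit assumption flagged in the TODOs above.
    import numpy as np
    import torch

    def scale_action(squashed, low, high):
        """Map a squashed action in [-1, 1]^n into [low, high] element-wise."""
        low_t = torch.as_tensor(low, dtype=squashed.dtype)
        high_t = torch.as_tensor(high, dtype=squashed.dtype)
        return low_t + 0.5 * (squashed + 1.0) * (high_t - low_t)

    # Usage with bounds that differ per dimension:
    low = np.array([-1.0, -2.0], dtype=np.float32)
    high = np.array([1.0, 0.5], dtype=np.float32)
    a = torch.tanh(torch.randn(2))        # squashed policy output in (-1, 1)
    print(scale_action(a, low, high))     # stays inside [low, high] per dimension

If the bounds are symmetric but unequal across dimensions, the same idea reduces to multiplying the squashed action by a per-dimension high vector instead of the scalar act_limit.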
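A similarly hedged illustration of the reward change in tworr.py: the single reward_eta weight is split into reward_eta_p + reward_eta_v + reward_eta_jerk = 0.7 + 0.15 + 0.15 = 1.0, and a jerk term (the step-to-step change of two state_buffer entries) is added alongside the position and velocity terms. The env's f_logistic is defined outside this diff, so the kernel below is only an assumed stand-in with the same call signature; it equals 1 at zero error and decays toward 0, so with this stand-in reward_t stays in (0, 1].

    # Hedged illustration only: f_logistic here is an assumed stand-in for the
    # env's own implementation, which is not shown in this diff.
    import numpy as np

    LP, LV, LJERK = 100, 100, 100               # lp, lv, ljerk from the patch
    ETA_P, ETA_V, ETA_JERK = 0.7, 0.15, 0.15    # reward_eta_* weights, sum to 1.0

    def f_logistic(error, l):
        """Assumed kernel: 1 at zero error, decaying toward 0 as l*|error| grows."""
        return 2.0 / (1.0 + np.exp(l * np.abs(error)))

    def reward(error_p, error_v, jerk_level):
        return (ETA_P * f_logistic(error_p, LP)
                + ETA_V * f_logistic(error_v, LV)
                + ETA_JERK * f_logistic(jerk_level, LJERK))

    print(reward(0.0, 0.0, 0.0))    # 1.0: perfect tracking with no jerk
    print(reward(0.05, 0.1, 0.2))   # each nonzero error shrinks its weighted share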