Reinforcement learning #1085

Merged 3 commits on May 30, 2020
examples/reinforcement_learning/tutorial_AC.py (19 changes: 11 additions & 8 deletions)
@@ -46,11 +46,11 @@

"""
import argparse
+ import os
import time
- import matplotlib.pyplot as plt
- import os

import gym
+ import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf

@@ -78,6 +78,8 @@
LR_A = 0.001 # learning rate for actor
LR_C = 0.01 # learning rate for critic



############################### Actor-Critic ####################################


@@ -137,12 +139,13 @@ def __init__(self, state_dim, lr=0.01):

self.optimizer = tf.optimizers.Adam(lr)

- def learn(self, state, reward, state_):
+ def learn(self, state, reward, state_, done):
+ d = 0 if done else 1
v_ = self.model(np.array([state_]))
with tf.GradientTape() as tape:
v = self.model(np.array([state]))
- ## TD_error = r + lambda * V(newS) - V(S)
- td_error = reward + LAM * v_ - v
+ ## TD_error = r + d * lambda * V(newS) - V(S)
+ td_error = reward + d * LAM * v_ - v
loss = tf.square(td_error)
grad = tape.gradient(loss, self.model.trainable_weights)
self.optimizer.apply_gradients(zip(grad, self.model.trainable_weights))
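The new `done` argument keeps the critic from bootstrapping across an episode boundary: with `d = 0` the TD target collapses to the immediate reward, so the value of a terminal state no longer leaks into the targets. A minimal self-contained sketch of the same masked TD(0) update, written against plain tf.keras instead of the tutorial's tensorlayer model (`value_net`, `critic_learn` and the layer sizes below are illustrative):

    import numpy as np
    import tensorflow as tf

    GAMMA = 0.9  # discount factor (the tutorial calls it LAM)
    value_net = tf.keras.Sequential([
        tf.keras.layers.Dense(32, activation='relu'),
        tf.keras.layers.Dense(1),
    ])
    optimizer = tf.keras.optimizers.Adam(0.01)

    def critic_learn(state, reward, next_state, done):
        d = 0.0 if done else 1.0  # mask the bootstrap term at terminal states
        # computed outside the tape, as in the tutorial: no gradient through V(s')
        v_next = value_net(np.array([next_state], np.float32))
        with tf.GradientTape() as tape:
            v = value_net(np.array([state], np.float32))
            # TD target r + d * gamma * V(s') reduces to r when the episode has ended
            td_error = reward + d * GAMMA * v_next - v
            loss = tf.reduce_mean(tf.square(td_error))
        grads = tape.gradient(loss, value_net.trainable_variables)
        optimizer.apply_gradients(zip(grads, value_net.trainable_variables))
        return td_error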
@@ -203,7 +206,7 @@ def load(self): # load trained weights
state_new, reward, done, info = env.step(action)
state_new = state_new.astype(np.float32)

- if done: reward = -20 # reward shaping trick
+ if done: reward = -20  # reward shaping trick
# these may be helpful in some tasks
# if abs(s_new[0]) >= env.observation_space.high[0]:
# # cart moves more than 2.4 units from the center
@@ -215,7 +218,7 @@ def load(self): # load trained weights

try:
td_error = critic.learn(
- state, reward, state_new
+ state, reward, state_new, done
) # learn Value-function : gradient = grad[r + lambda * V(s_new) - V(s)]
actor.learn(state, action, td_error) # learn Policy : true_gradient = grad[logPi(s, a) * td_error]
except KeyboardInterrupt: # if Ctrl+C at running actor.learn(), then save model, or exit if not at actor.learn()
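For context, the TD error returned here is what actor.learn consumes as an advantage estimate for the policy-gradient step described in the comment above. A hedged sketch of that step for a discrete action space, again in plain tf.keras rather than the tutorial's Actor class (`policy_net`, `actor_learn` and the cross-entropy formulation are illustrative):

    import numpy as np
    import tensorflow as tf

    def actor_learn(policy_net, optimizer, state, action, td_error):
        with tf.GradientTape() as tape:
            logits = policy_net(np.array([state], np.float32))
            # sparse softmax cross-entropy gives -log pi(a|s) for the taken action
            neg_log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=np.array([action], np.int32), logits=logits)
            # minimizing -log pi(a|s) * td_error follows grad[log pi(s, a) * td_error];
            # stop_gradient keeps the critic's error out of the actor's graph
            loss = neg_log_prob * tf.stop_gradient(td_error)
        grads = tape.gradient(loss, policy_net.trainable_variables)
        optimizer.apply_gradients(zip(grads, policy_net.trainable_variables))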
@@ -238,7 +241,7 @@ def load(self): # load trained weights

# Early Stopping for quick check
if step >= MAX_STEPS:
print("Early Stopping") # Hao Dong: it is important for this task
print("Early Stopping") # Hao Dong: it is important for this task
break
actor.save()
critic.save()
examples/reinforcement_learning/tutorial_DPPO.py (15 changes: 8 additions & 7 deletions)
@@ -37,8 +37,8 @@
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf

import tensorflow_probability as tfp

import tensorlayer as tl

parser = argparse.ArgumentParser(description='Train or test neural net motor controller.')
@@ -73,6 +73,7 @@
# ppo-clip parameters
EPSILON = 0.2


############################### DPPO ####################################


@@ -282,7 +283,10 @@ def work(self):
GLOBAL_UPDATE_COUNTER += 1 # count to minimum batch size, no need to wait other workers
if t == MAX_STEPS - 1 or GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE:
# finish path
- v_s_ = self.ppo.critic(np.array([s_], np.float32))[0][0]
+ if done:
+ v_s_ = 0
+ else:
+ v_s_ = self.ppo.critic(np.array([s_], np.float32))[0][0]
discounted_r = [] # compute discounted reward
for r in buffer_r[::-1]:
v_s_ = r + GAMMA * v_s_
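This is the core of the fix: the rollout can end either because the batch is full or because the episode terminated, and only in the latter case must the tail value be zero rather than the critic's estimate of V(s'). A small stand-alone sketch of the resulting return computation (`discounted_returns` and `critic_value` are illustrative names mirroring the loop above):

    import numpy as np

    def discounted_returns(buffer_r, done, critic_value, gamma=0.9):
        v_s_ = 0.0 if done else critic_value  # never bootstrap past a terminal state
        discounted_r = []
        for r in buffer_r[::-1]:  # scan the collected rewards backwards
            v_s_ = r + gamma * v_s_
            discounted_r.append(v_s_)
        discounted_r.reverse()
        return np.array(discounted_r, np.float32)

    # discounted_returns([1.0, 1.0, 1.0], done=True, critic_value=12.3)
    # -> [2.71, 1.9, 1.0]; a stale critic estimate no longer inflates the targets.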
@@ -304,8 +308,7 @@ def work(self):

print(
'Training | Episode: {}/{} | Worker: {} | Episode Reward: {:.4f} | Running Time: {:.4f}'.format(
- GLOBAL_EP + 1, TRAIN_EPISODES, self.wid, ep_r,
- time.time() - T0
+ GLOBAL_EP + 1, TRAIN_EPISODES, self.wid, ep_r, time.time() - T0
)
)
# record reward changes, plot later
@@ -372,6 +375,4 @@ def work(self):
print(
'Testing | Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'.format(
episode + 1, TEST_EPISODES, episode_reward,
- time.time() - T0
- )
- )
+ time.time() - T0))
examples/reinforcement_learning/tutorial_PPO.py (25 changes: 12 additions & 13 deletions)
@@ -30,8 +30,8 @@
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf

import tensorflow_probability as tfp

import tensorlayer as tl

parser = argparse.ArgumentParser(description='Train or test neural net motor controller.')
@@ -63,14 +63,14 @@
# ppo-clip parameters
EPSILON = 0.2


############################### PPO ####################################


class PPO(object):
"""
PPO class
"""

def __init__(self, state_dim, action_dim, action_bound, method='clip'):
# critic
with tf.name_scope('critic'):
@@ -233,13 +233,16 @@ def store_transition(self, state, action, reward):
self.action_buffer.append(action)
self.reward_buffer.append(reward)

- def finish_path(self, next_state):
+ def finish_path(self, next_state, done):
"""
Calculate cumulative reward
:param next_state:
:return: None
"""
- v_s_ = self.critic(np.array([next_state], np.float32))[0, 0]
+ if done:
+ v_s_ = 0
+ else:
+ v_s_ = self.critic(np.array([next_state], np.float32))[0, 0]
discounted_r = []
for r in self.reward_buffer[::-1]:
v_s_ = r + GAMMA * v_s_
@@ -280,17 +283,15 @@ def finish_path(self, next_state):
episode_reward += reward

# update ppo
- if (step + 1) % BATCH_SIZE == 0:
- agent.finish_path(state_)
+ if len(agent.state_buffer) >= BATCH_SIZE:
+ agent.finish_path(state_, done)
agent.update()
if done:
break
- agent.finish_path(state_)
+ agent.finish_path(state_, done)
print(
'Training | Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'.format(
- episode + 1, TRAIN_EPISODES, episode_reward,
- time.time() - t0
- )
+ episode + 1, TRAIN_EPISODES, episode_reward, time.time() - t0)
)
if episode == 0:
all_episode_reward.append(episode_reward)
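Besides threading `done` through finish_path, the update trigger changed from a per-episode step count to the size of the stored batch. `step` restarts every episode while the transition buffer persists until update() clears it, so the old modulo test can fire too rarely, or at the wrong moment, once episodes end mid-batch. A tiny self-contained illustration of the difference with dummy episode lengths (no environment needed; all names are illustrative):

    import random

    BATCH_SIZE = 32
    random.seed(0)
    episode_lengths = [random.randint(5, 60) for _ in range(30)]  # episodes end at arbitrary steps

    def count_updates(trigger):
        buffer, updates = [], 0
        for length in episode_lengths:
            for step in range(length):
                buffer.append(None)  # stand-in for one stored transition
                if trigger(step, buffer):
                    updates += 1
                    buffer.clear()  # agent.update() empties the buffer
        return updates

    old = count_updates(lambda step, buf: (step + 1) % BATCH_SIZE == 0)  # per-episode counter
    new = count_updates(lambda step, buf: len(buf) >= BATCH_SIZE)        # actual batch size
    print(old, new)  # the modulo trigger fires less often and with oversized, uneven batches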
@@ -318,6 +319,4 @@ def finish_path(self, next_state):
print(
'Testing | Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'.format(
episode + 1, TEST_EPISODES, episode_reward,
- time.time() - t0
- )
- )
+ time.time() - t0))
examples/reinforcement_learning/tutorial_SAC.py (4 changes: 2 additions & 2 deletions)
@@ -185,7 +185,7 @@ def evaluate(self, state, epsilon=1e-6):
std = tf.math.exp(log_std) # no clip in evaluation, clip affects gradients flow

normal = Normal(0, 1)
- z = normal.sample()
+ z = normal.sample(mean.shape)
action_0 = tf.math.tanh(mean + std * z) # TanhNormal distribution as actions; reparameterization trick
action = self.action_range * action_0
# according to original paper, with an extra last term for normalizing different action range
@@ -204,7 +204,7 @@ def get_action(self, state, greedy=False):
std = tf.math.exp(log_std)

normal = Normal(0, 1)
- z = normal.sample()
+ z = normal.sample(mean.shape)
action = self.action_range * tf.math.tanh(
mean + std * z
) # TanhNormal distribution as actions; reparameterization trick
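The reason for `sample(mean.shape)`: `Normal(0, 1).sample()` draws a single scalar, so the same noise value was broadcast across every batch entry and every action dimension, which collapses the exploration noise. Sampling with the shape of `mean` draws independent noise per element, which is what the reparameterization trick assumes. A short sketch of the difference (shapes are illustrative; uses tensorflow_probability's Normal as the tutorial does):

    import tensorflow as tf
    import tensorflow_probability as tfp

    Normal = tfp.distributions.Normal

    mean = tf.zeros([4, 2])  # e.g. a batch of 4 states and a 2-dimensional action
    std = tf.ones([4, 2])
    action_range = 1.0

    normal = Normal(0.0, 1.0)
    z_scalar = normal.sample()          # shape (): one value broadcast everywhere
    z_full = normal.sample(mean.shape)  # shape (4, 2): independent noise per element
    action = action_range * tf.math.tanh(mean + std * z_full)  # reparameterization trick
    print(z_scalar.shape, z_full.shape)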