commit 0d42960 (0 parents)
Showing 29 changed files with 4,359 additions and 0 deletions.
@@ -0,0 +1,12 @@
# Notes on Hung-yi Lee's Deep Reinforcement Learning Course

### Course homepage: [NTU-MLDS18](http://speech.ee.ntu.edu.tw/~tlkagk/courses_MLDS18.html)

### Videos:
- [YouTube](https://www.youtube.com/playlist?list=PLJV_el3uVTsODxQFgzMzPLa16h6B8kWM_)
- [Bilibili](https://www.bilibili.com/video/av24724071/?spm_id_from=333.788.videocard.4)

### References:
[Assignment code reference](https://github.com/JasonYao81000/MLDS2018SPRING/tree/master/hw4) | [Pure-numpy implementations of non-deep RL algorithms](https://github.com/ddbourgin/numpy-ml/tree/master/numpy_ml/rl_models) | [OpenAI tutorial](https://github.com/openai/spinningup/tree/master/docs)

The course's learning path is shown above; reinforcement learning is taught as a standalone module. Rather than starting from MDPs, Hung-yi Lee begins with the question of how to maximize expected reward, which leads directly to Policy Gradient (and PPO). He then covers Q-learning (the original Q-learning, DQN, and the various DQN upgrades), followed by A2C (plus A3C and DDPG), then several reward-shaping methods (mainly Curiosity, Curriculum Learning, and Hierarchical Learning), and finally Imitation Learning (Inverse RL). This gives a fairly complete and intuitive picture of the core of deep reinforcement learning, similar in spirit to the Berkeley-style courses and quite different from UCL's approach of starting with MDPs and solving value-iteration problems.
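
That starting point (maximize expected reward, which leads straight to Policy Gradient) boils down to the REINFORCE update: move the policy parameters along the gradient of log pi(a|s), weighted by the return. The sketch below is purely illustrative and not part of the course code; it assumes a tabular softmax policy in plain numpy, and all function names are made up for this note.

```python
import numpy as np

def softmax(x):
    e = np.exp(x - x.max())
    return e / e.sum()

def grad_log_pi(theta, s, a):
    # d/d(theta[s]) of log softmax(theta[s])[a] = one_hot(a) - softmax(theta[s])
    g = np.zeros_like(theta)
    g[s] = -softmax(theta[s])
    g[s, a] += 1.0
    return g

def reinforce_update(theta, episode, lr=0.01):
    # episode: list of (state, action, return-from-that-step-onward) tuples
    for s, a, G in episode:
        theta += lr * G * grad_log_pi(theta, s, a)  # gradient ascent on expected return
    return theta
```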
@@ -0,0 +1,175 @@
import tensorflow as tf
import numpy as np
import gym

'''
Comparison of policy-gradient-style algorithms:
PG   loss = log_prob * estimated value (from the Bellman equation)
A2C  loss = log_prob * TD-error (from the critic network; expresses how much better the current action is than the average action)
DDPG: the critic not only influences the actor, the actor also feeds the critic; the critic tells the actor not just how good its action was, but also in which direction to improve (it passes a gradient dq/da)
PPO : constrains the PG update to improve training stability; compared with A2C, only the actor network is more complex
'''
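
# The PPO objective mentioned above is not implemented in this file; the function
# below is only an illustrative numpy sketch (the name and the clip range eps=0.2
# are assumptions, not part of this repository's code). PPO maximizes
# min(ratio * A, clip(ratio, 1 - eps, 1 + eps) * A), where ratio = pi_new(a|s) / pi_old(a|s)
# and A is the advantage estimate (value and entropy terms omitted).
def ppo_clipped_loss(ratio, advantage, eps=0.2):
    # negative sign: return a loss to be minimized
    return -np.mean(np.minimum(ratio * advantage,
                               np.clip(ratio, 1.0 - eps, 1.0 + eps) * advantage))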

class Actor(object):  # still policy gradient at heart, but A2C updates every step
    def __init__(self,
                 sess,        # the two networks share one session, so it is created outside
                 n_actions,
                 n_features,
                 lr=0.01, ):
        # self.ep_obs, self.ep_as, self.ep_rs = [], [], []  # single-step updates, so no need to store whole episodes
        self.sess = sess

        self.s = tf.placeholder(tf.float32, [1, n_features], "state")
        self.a = tf.placeholder(tf.int32, None, "act")
        self.td_error = tf.placeholder(tf.float32, None, "td_error")  # scales the update; the TD error can be read as Q(s, a) - V(s), the value of this action minus the average action value

        with tf.variable_scope('Actor'):  # variable_scope (instead of name_scope) allows sharing variables within a scope
            l1 = tf.layers.dense(
                inputs=self.s,
                units=20,  # number of hidden units
                activation=tf.nn.relu,
                kernel_initializer=tf.random_normal_initializer(0., .1),  # weights
                bias_initializer=tf.constant_initializer(0.1),            # biases
                name='l1'
            )

            self.acts_prob = tf.layers.dense(
                inputs=l1,
                units=n_actions,           # output units
                activation=tf.nn.softmax,  # get action probabilities
                kernel_initializer=tf.random_normal_initializer(0., .1),  # weights
                bias_initializer=tf.constant_initializer(0.1),            # biases
                name='acts_prob'
            )

        # with tf.name_scope('loss'):
        #     Maximizing the total reward (log_p * R) is the same as minimizing -(log_p * R), and TF can only minimize a loss.
        #     neg_log_prob = tf.reduce_sum(-tf.log(self.all_act_prob) * tf.one_hot(self.tf_acts, self.n_actions), axis=1)  # negate to turn ascent into descent
        #     loss = tf.reduce_mean(neg_log_prob * self.tf_vt)
        with tf.variable_scope('loss'):
            log_prob = tf.log(self.acts_prob[0, self.a])  # [[0.1, 0.2, 0.3]] -> 0.1 if a == 0
            self.loss = log_prob * self.td_error          # advantage (TD error) guided loss

        with tf.name_scope('train'):
            self.train_op = tf.train.AdamOptimizer(lr).minimize(-self.loss)  # minimize the negative = maximize the objective

    def choose_action(self, s):  # sample an action
        s = s[np.newaxis, :]
        probs = self.sess.run(self.acts_prob, {self.s: s})  # get probabilities for all actions
        action = np.random.choice(np.arange(probs.shape[1]), p=probs.ravel())
        return action  # return an int

    def learn(self, s, a, td):
        s = s[np.newaxis, :]
        feed_dict = {self.s: s, self.a: a, self.td_error: td}
        _, loss = self.sess.run([self.train_op, self.loss], feed_dict)
        return loss


class Critic(object):
    def __init__(self, sess, n_features, lr=0.01, gamma=0.9):
        self.sess = sess

        self.s = tf.placeholder(tf.float32, [1, n_features], "state")
        self.v_ = tf.placeholder(tf.float32, [1, 1], "v_next")
        self.r = tf.placeholder(tf.float32, None, 'r')

        with tf.variable_scope('Critic'):
            l1 = tf.layers.dense(
                inputs=self.s,
                units=20,  # number of hidden units
                activation=tf.nn.relu,  # or None:
                # arguably has to be linear to guarantee convergence of the actor,
                # but a linear approximator hardly learns the correct Q.
                kernel_initializer=tf.random_normal_initializer(0., .1),  # weights
                bias_initializer=tf.constant_initializer(0.1),            # biases
                name='l1'
            )

            self.v = tf.layers.dense(
                inputs=l1,
                units=1,  # output units
                activation=None,
                kernel_initializer=tf.random_normal_initializer(0., .1),  # weights
                bias_initializer=tf.constant_initializer(0.1),            # biases
                name='V'
            )

        with tf.variable_scope('squared_TD_error'):
            self.td_error = self.r + gamma * self.v_ - self.v
            self.loss = tf.square(self.td_error)  # TD_error = (r + gamma * V_next) - V_eval
        with tf.variable_scope('train'):
            self.train_op = tf.train.AdamOptimizer(lr).minimize(self.loss)

    def learn(self, s, r, s_):
        s, s_ = s[np.newaxis, :], s_[np.newaxis, :]

        v_ = self.sess.run(self.v, {self.s: s_})
        td_error, _ = self.sess.run([self.td_error, self.train_op],
                                    {self.s: s, self.v_: v_, self.r: r})
        return td_error


np.random.seed(2)
tf.set_random_seed(2)  # reproducible

# Hyperparameters
OUTPUT_GRAPH = False
MAX_EPISODE = 100  # 3000
DISPLAY_REWARD_THRESHOLD = 200  # render the environment once the running reward exceeds this threshold
MAX_EP_STEPS = 1000             # maximum number of time steps in one episode
RENDER = False                  # rendering wastes time
GAMMA = 0.9                     # reward discount in TD error
LR_A = 0.01                     # learning rate for actor
LR_C = 0.05                     # learning rate for critic

env = gym.make('CartPole-v0')
env.seed(1)  # reproducible
env = env.unwrapped

N_F = env.observation_space.shape[0]
N_A = env.action_space.n

sess = tf.Session()  # the two networks share one session

actor = Actor(sess, n_features=N_F, n_actions=N_A, lr=LR_A)
critic = Critic(sess, n_features=N_F, lr=LR_C)  # we need a good teacher, so the critic should learn faster than the actor

sess.run(tf.global_variables_initializer())

if OUTPUT_GRAPH:
    tf.summary.FileWriter("logs/", sess.graph)

for i_episode in range(MAX_EPISODE):
    state = env.reset()
    t = 0
    r_list = []

    while True:
        if RENDER:
            env.render()
        action = actor.choose_action(state)
        state_, reward, done, info = env.step(action)
        if done:
            reward = -20  # penalty on the final step, a common trick
        r_list.append(reward)
        td_error = critic.learn(state, reward, state_)
        actor.learn(state, action, td_error)
        state = state_
        t += 1

        if done or t >= MAX_EP_STEPS:
            ep_rs_sum = sum(r_list)
            if 'running_reward' not in globals():
                running_reward = ep_rs_sum
            else:
                running_reward = running_reward * 0.95 + ep_rs_sum * 0.05
            if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True  # start rendering once the policy is good enough
            print("episode:", i_episode, " reward:", int(running_reward))
            break

@@ -0,0 +1,136 @@
import tensorflow as tf
import numpy as np
import gym
import time

#####################  hyper parameters  ####################

MAX_EPISODES = 200
MAX_EP_STEPS = 200
LR_A = 0.01   # learning rate for actor
LR_C = 0.02   # learning rate for critic
GAMMA = 0.9   # reward discount
TAU = 0.01    # soft replacement
MEMORY_CAPACITY = 10000
BATCH_SIZE = 32

RENDER = False
ENV_NAME = 'Pendulum-v0'

# Pendulum has continuous action and observation spaces.
# Action space: a single torque value (dimension 1); continuous but bounded to [-2, 2].
# Observation space: 3-dimensional (cos(theta), sin(theta), angular velocity).

###############################  DDPG  ####################################
# Off-policy, per-step, mini-batch training with a replay buffer.
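# Summary of the update rules the class below builds in __init__ (written out
# here only for reference; mu/mu' denote the eval/target actors, Q/Q' the
# eval/target critics):
#   critic target : y = R + GAMMA * Q'(s_, mu'(s_))
#   critic loss   : mean((y - Q(s, a))^2)        -> self.ctrain
#   actor loss    : -mean(Q(s, mu(s)))           -> self.atrain (dq/da flows back into the actor)
#   soft update   : target_params <- (1 - TAU) * target_params + TAU * eval_params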
class DDPG(object):
    def __init__(self, a_dim, s_dim, a_bound,):  # builds the two network graphs; both actor and critic have a target network, and the target networks are not trained directly
        self.memory = np.zeros((MEMORY_CAPACITY, s_dim * 2 + a_dim + 1), dtype=np.float32)  # replay buffer; s_dim * 2 because each row stores both s and s_
        self.pointer = 0
        self.sess = tf.Session()

        self.a_dim, self.s_dim, self.a_bound = a_dim, s_dim, a_bound
        self.S = tf.placeholder(tf.float32, [None, s_dim], 's')  # the leading None leaves room for the batch size
        self.S_ = tf.placeholder(tf.float32, [None, s_dim], 's_')
        self.R = tf.placeholder(tf.float32, [None, 1], 'r')

        with tf.variable_scope('Actor'):
            self.a = self._build_a(self.S, scope='eval', trainable=True)    # the pi network being trained; also collects data. input s, output a
            a_ = self._build_a(self.S_, scope='target', trainable=False)    # the target actor is not trained; it only supplies a_ to the target critic. input s_, output a_
        with tf.variable_scope('Critic'):
            q = self._build_c(self.S, self.a, scope='eval', trainable=True)   # the Q network being trained; built on the actor's output self.a, so dq/da can flow to the actor; during critic training, actions from memory are fed into self.a
            q_ = self._build_c(self.S_, a_, scope='target', trainable=False)  # not trained; provides Q'(s_, a_) for the Bellman target below

        # networks parameters
        self.ae_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/eval')
        self.at_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/target')
        self.ce_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/eval')
        self.ct_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/target')

        # target network update: softly copy parameters from the eval networks
        self.soft_replace = [tf.assign(t, (1 - TAU) * t + TAU * e)
                             for t, e in zip(self.at_params + self.ct_params, self.ae_params + self.ce_params)]

        # train the critic (eval) network
        q_target = self.R + GAMMA * q_  # Bellman target (q_ is the target critic's output for (s_, a_)); the "ground-truth" q compared against the prediction via MSE
        td_error = tf.losses.mean_squared_error(labels=q_target, predictions=q)  # the prediction q is the eval critic's output for the current (s, a)
        self.ctrain = tf.train.AdamOptimizer(LR_C).minimize(td_error, var_list=self.ce_params)  # only the eval critic's parameters are trained, by minimizing the MSE

        # train the actor (eval) network
        a_loss = -tf.reduce_mean(q)  # maximize q
        self.atrain = tf.train.AdamOptimizer(LR_A).minimize(a_loss, var_list=self.ae_params)

        self.sess.run(tf.global_variables_initializer())

    def choose_action(self, s):
        s = s[np.newaxis, :]
        return self.sess.run(self.a, feed_dict={self.S: s})[0]  # single action

    def learn(self):
        # each learning step first soft-updates the target network parameters
        self.sess.run(self.soft_replace)
        indices = np.random.choice(MEMORY_CAPACITY, size=BATCH_SIZE)
        bt = self.memory[indices, :]                     # sample a batch of transitions from memory
        bs = bt[:, :self.s_dim]                          # a batch of states
        ba = bt[:, self.s_dim: self.s_dim + self.a_dim]  # a batch of actions
        br = bt[:, -self.s_dim - 1: -self.s_dim]         # a batch of rewards
        bs_ = bt[:, -self.s_dim:]                        # a batch of next states

        # train on one batch at a time; the target networks are effectively frozen during the batch
        self.sess.run(self.atrain, {self.S: bs})
        self.sess.run(self.ctrain, {self.S: bs, self.a: ba, self.R: br, self.S_: bs_})

    def store_transition(self, s, a, r, s_):  # standard bookkeeping for an off-policy algorithm
        transition = np.hstack((s, a, [r], s_))
        index = self.pointer % MEMORY_CAPACITY  # replace the old memory with new memory
        self.memory[index, :] = transition
        self.pointer += 1

    def _build_a(self, s, scope, trainable):  # actor network: directly outputs a deterministic action a
        with tf.variable_scope(scope):
            net = tf.layers.dense(s, 30, activation=tf.nn.relu, name='l1', trainable=trainable)
            a = tf.layers.dense(net, self.a_dim, activation=tf.nn.tanh, name='a', trainable=trainable)  # tanh squashes a into [-1, 1]
            return tf.multiply(a, self.a_bound, name='scaled_a')  # multiplying by the bound [max] keeps the output in [-max, max]; if the min and max are not symmetric, clip instead

    def _build_c(self, s, a, scope, trainable):  # critic network: outputs Q(s, a)
        with tf.variable_scope(scope):
            n_l1 = 30
            w1_s = tf.get_variable('w1_s', [self.s_dim, n_l1], trainable=trainable)
            w1_a = tf.get_variable('w1_a', [self.a_dim, n_l1], trainable=trainable)
            b1 = tf.get_variable('b1', [1, n_l1], trainable=trainable)
            net = tf.nn.relu(tf.matmul(s, w1_s) + tf.matmul(a, w1_a) + b1)
            return tf.layers.dense(net, 1, trainable=trainable)  # Q(s, a)


env = gym.make(ENV_NAME)
env = env.unwrapped
env.seed(1)
s_dim = env.observation_space.shape[0]
a_dim = env.action_space.shape[0]
a_bound = env.action_space.high
ddpg = DDPG(a_dim, s_dim, a_bound)

var = 3  # control exploration
t1 = time.time()
for i in range(MAX_EPISODES):
    s = env.reset()
    ep_reward = 0
    for j in range(MAX_EP_STEPS):  # needed because the environment has no natural termination condition
        if RENDER:
            env.render()
        a = ddpg.choose_action(s)
        a = np.clip(np.random.normal(a, var), -2, 2)  # exploration noise: sample around the actor's output with scale var, then clip to [-2, 2]
        s_, r, done, info = env.step(a)

        ddpg.store_transition(s, a, r / 10, s_)
        if ddpg.pointer > MEMORY_CAPACITY:  # start training the networks once the buffer is full
            var *= 0.9995                   # gradually reduce exploration noise
            ddpg.learn()                    # training starts after 10000 transitions; from then on, every environment step samples one batch and performs one update
        s = s_
        ep_reward += r
        if j == MAX_EP_STEPS - 1:
            print('Episode:', i, ' Reward: %i' % int(ep_reward), 'Explore: %.2f' % var, )
            # if ep_reward > -300: RENDER = True
            break
print('Running time: ', time.time() - t1)