Upload local notes, code, and slides
morningsky committed Nov 6, 2019
0 parents commit 0d42960
Showing 29 changed files with 4,359 additions and 0 deletions.
12 changes: 12 additions & 0 deletions README.md
@@ -0,0 +1,12 @@
# Hung-yi Lee (李宏毅) Deep Reinforcement Learning Notes

### Course homepage: [NTU-MLDS18](http://speech.ee.ntu.edu.tw/~tlkagk/courses_MLDS18.html)

### Videos:
- [YouTube](https://www.youtube.com/playlist?list=PLJV_el3uVTsODxQFgzMzPLa16h6B8kWM_)
- [Bilibili](https://www.bilibili.com/video/av24724071/?spm_id_from=333.788.videocard.4)

### References:
- [Homework code reference](https://github.com/JasonYao81000/MLDS2018SPRING/tree/master/hw4)
- [Pure-NumPy implementations of (non-deep) RL algorithms](https://github.com/ddbourgin/numpy-ml/tree/master/numpy_ml/rl_models)
- [OpenAI Spinning Up tutorial](https://github.com/openai/spinningup/tree/master/docs)

The course treats reinforcement learning as a standalone module. Rather than starting from MDPs, Hung-yi Lee starts from the question of how to maximize reward, which leads directly to Policy Gradient (and PPO), then Q-learning (vanilla Q-learning, DQN, and the various DQN improvements), then A2C (plus A3C and DDPG), followed by several reward-shaping techniques (mainly Curiosity, Curriculum Learning, and Hierarchical Learning), and finally Imitation Learning (Inverse RL). This gives a fairly complete and intuitive picture of the core of deep reinforcement learning, in a style close to the Berkeley courses and quite different from the UCL approach of starting with MDPs and solving value-iteration problems.
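
For reference, the course's starting point of directly maximizing expected reward is the vanilla policy-gradient loss; a minimal TensorFlow 1.x sketch (placeholder tensors, not code from this repository) looks like:

```python
import tensorflow as tf

# Vanilla policy gradient (REINFORCE): maximize E[log pi(a|s) * R],
# implemented by minimizing its negative.
log_prob = tf.placeholder(tf.float32, [None], name='log_prob')  # log pi(a_t | s_t)
returns = tf.placeholder(tf.float32, [None], name='returns')    # discounted return R_t
pg_loss = -tf.reduce_mean(log_prob * returns)
```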
175 changes: 175 additions & 0 deletions code/actor_critic_advantage.py
@@ -0,0 +1,175 @@
import tensorflow as tf
import numpy as np
import gym

'''
Comparison of policy-gradient-family algorithms:
  PG   loss = log_prob * estimated return (from the Bellman equation / Monte-Carlo estimate)
  A2C  loss = log_prob * TD error (from the critic network; it measures how much better the
              current action is than the average action)
  DDPG: the critic not only influences the actor, the actor also influences the critic; the
        critic does not just tell the actor how good its action was, it also tells it how to
        improve (by passing back the gradient dq/da)
  PPO:  adds a constraint on the size of the PG update to improve training stability;
        compared with A2C, only the actor network is more complex
'''
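
# The docstring above mentions PPO but this file only implements A2C; the helper below is a
# minimal, illustrative sketch of PPO's clipped surrogate objective (the argument names are
# placeholders and nothing in this file calls it).
def ppo_clipped_surrogate(log_prob_new, log_prob_old, advantage, epsilon=0.2):
    ratio = tf.exp(log_prob_new - log_prob_old)                           # pi_new(a|s) / pi_old(a|s)
    clipped_ratio = tf.clip_by_value(ratio, 1.0 - epsilon, 1.0 + epsilon)
    surrogate = tf.minimum(ratio * advantage, clipped_ratio * advantage)
    return -tf.reduce_mean(surrogate)                                     # minimize the negative objective
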
class Actor(object):  # essentially still policy gradient, but A2C updates at every step
    def __init__(self,
                 sess,  # the two networks share one session, so it is created outside
                 n_actions,
                 n_features,
                 lr=0.01, ):
        #self.ep_obs, self.ep_as, self.ep_rs = [], [], []  # not needed: A2C updates every step, so whole-episode data is never stored
self.sess = sess

self.s = tf.placeholder(tf.float32, [1, n_features], "state")
        self.a = tf.placeholder(tf.int32, None, "act")
        self.td_error = tf.placeholder(tf.float32, None, "td_error")  # scale of the update; the TD error can be read as Q(s, a) - V(s), the value of this action minus the average action value
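        # Note: the TD error is a one-sample estimate of the advantage,
        #   A(s, a) = Q(s, a) - V(s) ~= r + gamma * V(s') - V(s),
        # i.e. how much better taking action a is than the policy's average behaviour in s.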

        with tf.variable_scope('Actor'):  # variable_scope (instead of name_scope) allows variables to be shared within the scope
l1 = tf.layers.dense(
inputs=self.s,
units=20, # number of hidden units
activation=tf.nn.relu,
kernel_initializer=tf.random_normal_initializer(0., .1), # weights
bias_initializer=tf.constant_initializer(0.1), # biases
name='l1'
)

self.acts_prob = tf.layers.dense(
inputs=l1,
units=n_actions, # output units
activation=tf.nn.softmax, # get action probabilities
kernel_initializer=tf.random_normal_initializer(0., .1), # weights
bias_initializer=tf.constant_initializer(0.1), # biases
name='acts_prob'
)

        #with tf.name_scope('loss'):
            # maximizing the total reward (log_p * R) is the same as minimizing -(log_p * R), and TF only provides loss minimization
            #neg_log_prob = tf.reduce_sum(-tf.log(self.all_act_prob) * tf.one_hot(self.tf_acts, self.n_actions), axis=1)  # the minus sign turns this into gradient descent
            #loss = tf.reduce_mean(neg_log_prob * self.tf_vt)
with tf.variable_scope('loss'):
log_prob = tf.log(self.acts_prob[0,self.a]) #[[0.1,0.2,0.3]] -> 0.1, if a=0
self.loss = log_prob * self.td_error # advantage (TD_error) guided loss

with tf.name_scope('train'):
self.train_op = tf.train.AdamOptimizer(lr).minimize(-self.loss)

    def choose_action(self, s):  # choose an action
s = s[np.newaxis, :]
probs = self.sess.run(self.acts_prob, {self.s: s}) # get probabilities for all actions
action = np.random.choice(np.arange(probs.shape[1]), p=probs.ravel())
        return action  # returns an int


def learn(self, s, a, td):
s = s[np.newaxis, :]
feed_dict = {self.s: s, self.a: a, self.td_error: td}
_, loss = self.sess.run([self.train_op, self.loss], feed_dict)
return loss


class Critic(object):
def __init__(self, sess, n_features, lr=0.01, gamma=0.9):
self.sess = sess

self.s = tf.placeholder(tf.float32, [1, n_features], "state")
self.v_ = tf.placeholder(tf.float32, [1, 1], "v_next")
self.r = tf.placeholder(tf.float32, None, 'r')

with tf.variable_scope('Critic'):
l1 = tf.layers.dense(
inputs=self.s,
units=20, # number of hidden units
                activation=tf.nn.relu,
                # with activation=None (linear) the actor's convergence is easier to guarantee,
                # but a linear approximator can hardly learn the correct Q.
kernel_initializer=tf.random_normal_initializer(0., .1), # weights
bias_initializer=tf.constant_initializer(0.1), # biases
name='l1'
)

self.v = tf.layers.dense(
inputs=l1,
units=1, # output units
activation=None,
kernel_initializer=tf.random_normal_initializer(0., .1), # weights
bias_initializer=tf.constant_initializer(0.1), # biases
name='V'
)

with tf.variable_scope('squared_TD_error'):
self.td_error = self.r + gamma * self.v_ - self.v
self.loss = tf.square(self.td_error) # TD_error = (r+gamma*V_next) - V_eval
with tf.variable_scope('train'):
self.train_op = tf.train.AdamOptimizer(lr).minimize(self.loss)

def learn(self, s, r, s_):
s, s_ = s[np.newaxis, :], s_[np.newaxis, :]

v_ = self.sess.run(self.v, {self.s: s_})
td_error, _ = self.sess.run([self.td_error, self.train_op],
{self.s: s, self.v_: v_, self.r: r})
return td_error

np.random.seed(2)
tf.set_random_seed(2) # reproducible

# Hyperparameters
OUTPUT_GRAPH = False
MAX_EPISODE = 100  # 3000
DISPLAY_REWARD_THRESHOLD = 200  # threshold on the running reward above which the environment could be rendered
MAX_EP_STEPS = 1000 # maximum time step in one episode
RENDER = False # rendering wastes time
GAMMA = 0.9 # reward discount in TD error
LR_A = 0.01 # learning rate for actor
LR_C = 0.05 # learning rate for critic

env = gym.make('CartPole-v0')
env.seed(1) # reproducible
env = env.unwrapped

N_F = env.observation_space.shape[0]
N_A = env.action_space.n

sess = tf.Session()  # the two networks share one session

actor = Actor(sess, n_features=N_F, n_actions=N_A, lr=LR_A)
critic = Critic(sess, n_features=N_F, lr=LR_C) # we need a good teacher, so the teacher should learn faster than the actor

sess.run(tf.global_variables_initializer())

if OUTPUT_GRAPH:
tf.summary.FileWriter("logs/", sess.graph)

for i_episode in range(MAX_EPISODE):
state = env.reset()
t = 0
r_list = []

while True:
if RENDER:
env.render()
        action = actor.choose_action(state)
        state_, reward, done, info = env.step(action)
        if done:
            reward = -20  # penalize the terminal step (a trick: unwrapped CartPole otherwise gives +1 every step)
        r_list.append(reward)
        td_error = critic.learn(state, reward, state_)  # the critic updates V(s) and returns the TD error
        actor.learn(state, action, td_error)            # the actor uses the TD error as the advantage
        state = state_
        t += 1

        if done or t >= MAX_EP_STEPS:
            ep_rs_sum = sum(r_list)
            if 'running_reward' not in globals():
                running_reward = ep_rs_sum
            else:
                running_reward = running_reward * 0.95 + ep_rs_sum * 0.05
            if running_reward > DISPLAY_REWARD_THRESHOLD:
                RENDER = False  # set to True to render the environment once the running reward passes the threshold
            print("episode:", i_episode, " reward:", int(running_reward))
            break






136 changes: 136 additions & 0 deletions code/ddpg_update.py
@@ -0,0 +1,136 @@
import tensorflow as tf
import numpy as np
import gym
import time

##################### hyper parameters ####################

MAX_EPISODES = 200
MAX_EP_STEPS = 200
LR_A = 0.01 # learning rate for actor
LR_C = 0.02 # learning rate for critic
GAMMA = 0.9 # reward discount
TAU = 0.01 # soft replacement
MEMORY_CAPACITY = 10000
BATCH_SIZE = 32

RENDER = False
ENV_NAME = 'Pendulum-v0'

# Pendulum has continuous action and state spaces.
# Action space: a single torque value (dimension 1); continuous, but bounded to [-2, 2].
# State space: a 3-dimensional observation [cos(theta), sin(theta), angular velocity].

############################### DDPG ####################################
# Off-policy training: updates every step, on mini-batches sampled from a replay buffer.
class DDPG(object):
    def __init__(self, a_dim, s_dim, a_bound):  # build both network graphs; the actor and the critic each have a target network, and the target networks are not trained directly
        self.memory = np.zeros((MEMORY_CAPACITY, s_dim * 2 + a_dim + 1), dtype=np.float32)  # replay buffer; each row stores [s, a, r, s_] (s_dim*2 covers s and s_)
self.pointer = 0
self.sess = tf.Session()

self.a_dim, self.s_dim, self.a_bound = a_dim, s_dim, a_bound
        self.S = tf.placeholder(tf.float32, [None, s_dim], 's')  # None is a placeholder for the batch size
self.S_ = tf.placeholder(tf.float32, [None, s_dim], 's_')
self.R = tf.placeholder(tf.float32, [None,1], 'r')

        with tf.variable_scope('Actor'):
            self.a = self._build_a(self.S, scope='eval', trainable=True)   # the policy network being trained; it also collects data  # input s, output a
            a_ = self._build_a(self.S_, scope='target', trainable=False)   # the target actor is not trained; it only supplies a_ to the target critic  # input s_, output a_
        with tf.variable_scope('Critic'):
            q = self._build_c(self.S, self.a, scope='eval', trainable=True)   # the critic being trained; its prediction is regressed onto the Bellman target (TD error / MSE); note a is fed from memory when training the critic
            q_ = self._build_c(self.S_, a_, scope='target', trainable=False)  # the target critic is not trained; it provides Q(s_, a_) for the Bellman target, while the eval critic's dq/da drives the actor update

# networks parameters
self.ae_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/eval')
self.at_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/target')
self.ce_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/eval')
self.ct_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/target')

        # soft target updates: slowly copy the eval networks' parameters into the target networks
self.soft_replace = [tf.assign(t, (1-TAU)*t + TAU *e) for t, e in zip(self.at_params+self.ct_params,self.ae_params+self.ce_params)]

        # train the critic (eval) network
        q_target = self.R + GAMMA * q_  # Bellman target: q_ comes from the target critic fed with (s_, a_); used as the "label" for the prediction q
        td_error = tf.losses.mean_squared_error(labels=q_target, predictions=q)  # the prediction q comes from the eval critic fed with the current (s, a)
        self.ctrain = tf.train.AdamOptimizer(LR_C).minimize(td_error, var_list=self.ce_params)  # only the eval critic's parameters are trained, minimizing the MSE

        # train the actor (eval) network
        a_loss = -tf.reduce_mean(q)  # maximize q
        self.atrain = tf.train.AdamOptimizer(LR_A).minimize(a_loss, var_list=self.ae_params)
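        # Minimizing -mean(q) with respect to the actor's parameters is the deterministic
        # policy gradient: q = Q(s, a) with a = mu(s), so backpropagation applies the chain
        # rule dQ/da * da/dtheta, which is the "critic passes dq/da to the actor" idea noted above.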

self.sess.run(tf.global_variables_initializer())


def choose_action(self, s):
s = s[np.newaxis, :]
return self.sess.run(self.a, feed_dict={self.S: s})[0] # single action


def learn(self):
        # every learning step starts with a soft update of the target networks
self.sess.run(self.soft_replace)
indices = np.random.choice(MEMORY_CAPACITY, size=BATCH_SIZE)
        bt = self.memory[indices, :]  # sample a batch of transitions [s, a, r, s_] from memory
        bs = bt[:, :self.s_dim]                            # a batch of states
        ba = bt[:, self.s_dim: self.s_dim + self.a_dim]    # a batch of actions
        br = bt[:, -self.s_dim - 1: -self.s_dim]           # a batch of rewards
        bs_ = bt[:, -self.s_dim:]                          # a batch of next states

        # train on one batch; during this batch the target networks are effectively frozen
self.sess.run(self.atrain, {self.S: bs})
self.sess.run(self.ctrain, {self.S: bs, self.a: ba, self.R: br, self.S_: bs_})


    def store_transition(self, s, a, r, s_):  # standard bookkeeping for an off-policy algorithm
transition = np.hstack((s, a, [r], s_))
index = self.pointer % MEMORY_CAPACITY # replace the old memory with new memory
self.memory[index, :] = transition
self.pointer += 1

    def _build_a(self, s, scope, trainable):  # actor network: directly outputs a deterministic action a
with tf.variable_scope(scope):
net = tf.layers.dense(s, 30, activation=tf.nn.relu, name='l1', trainable=trainable)
            a = tf.layers.dense(net, self.a_dim, activation=tf.nn.tanh, name='a', trainable=trainable)  # tanh squashes a into [-1, 1]
            return tf.multiply(a, self.a_bound, name='scaled_a')  # multiplying by the bound [max] scales the output to [-max, max]; if the min and max are not negatives of each other, an extra clip/rescale is needed
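            # For an asymmetric action range [low, high], one illustrative alternative would be to
            # rescale the tanh output instead: a = low + (tanh_out + 1.0) * 0.5 * (high - low)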

    def _build_c(self, s, a, scope, trainable):  # critic network: outputs Q(s, a)
with tf.variable_scope(scope):
n_l1 = 30
w1_s = tf.get_variable('w1_s', [self.s_dim, n_l1], trainable=trainable)
w1_a = tf.get_variable('w1_a', [self.a_dim, n_l1], trainable=trainable)
b1 = tf.get_variable('b1', [1, n_l1], trainable=trainable)
net = tf.nn.relu(tf.matmul(s, w1_s) + tf.matmul(a, w1_a) + b1)
return tf.layers.dense(net, 1, trainable=trainable) # Q(s,a)

env = gym.make(ENV_NAME)
env = env.unwrapped
env.seed(1)
s_dim = env.observation_space.shape[0]
a_dim = env.action_space.shape[0]
a_bound = env.action_space.high
ddpg = DDPG(a_dim, s_dim, a_bound)

var = 3 # control exploration
t1 = time.time()
for i in range(MAX_EPISODES):
s = env.reset()
ep_reward = 0
    for j in range(MAX_EP_STEPS):  # games without a natural termination condition need an explicit step limit like this
if RENDER:
env.render()
a = ddpg.choose_action(s)
        a = np.clip(np.random.normal(a, var), -2, 2)  # exploration noise: sample around the actor's output with standard deviation var, then keep a within [-2, 2]
s_, r, done, info = env.step(a)

ddpg.store_transition(s, a, r/10, s_)
        if ddpg.pointer > MEMORY_CAPACITY:  # start training once the memory is full
            var *= 0.9995  # decay the exploration noise
            ddpg.learn()   # after the buffer has filled (10000 transitions), sample a batch and train once per environment step
s = s_
ep_reward += r
if j == MAX_EP_STEPS-1:
print('Episode:', i, ' Reward: %i' % int(ep_reward), 'Explore: %.2f' % var, )
# if ep_reward > -300:RENDER = True
break
print('Running time: ', time.time() - t1)