Upload local notes, code, and slides
morningsky committed Nov 6, 2019
0 parents commit 0d42960
Showing 29 changed files with 4,359 additions and 0 deletions.
12 changes: 12 additions & 0 deletions README.md
@@ -0,0 +1,12 @@
# Hung-yi Lee (李宏毅) Deep Reinforcement Learning Notes

### Course homepage: [NTU-MLDS18](http://speech.ee.ntu.edu.tw/~tlkagk/courses_MLDS18.html)

### Videos:
- [YouTube](https://www.youtube.com/playlist?list=PLJV_el3uVTsODxQFgzMzPLa16h6B8kWM_)
- [Bilibili](https://www.bilibili.com/video/av24724071/?spm_id_from=333.788.videocard.4)

### References:
- [Homework code reference](https://github.com/JasonYao81000/MLDS2018SPRING/tree/master/hw4)
- [Pure-NumPy implementations of (non-deep) RL algorithms](https://github.com/ddbourgin/numpy-ml/tree/master/numpy_ml/rl_models)
- [OpenAI Spinning Up tutorial](https://github.com/openai/spinningup/tree/master/docs)

The course treats reinforcement learning as a standalone module. Rather than starting from MDPs, Hung-yi Lee starts from the question of how to maximize reward, which leads directly to Policy Gradient (and PPO), then Q-learning (vanilla Q-learning, DQN, and the various DQN improvements), then A2C (plus A3C and DDPG), followed by several reward-shaping techniques (mainly Curiosity, Curriculum Learning, and Hierarchical Learning), and finally Imitation Learning (Inverse RL). This gives a fairly complete and intuitive picture of the core of deep reinforcement learning, in a style close to the Berkeley courses and quite different from the UCL approach of starting with MDPs and solving value-iteration problems.
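
For reference, the course's starting point of directly maximizing expected reward is the vanilla policy-gradient loss; a minimal TensorFlow 1.x sketch (placeholder tensors, not code from this repository) looks like:

```python
import tensorflow as tf

# Vanilla policy gradient (REINFORCE): maximize E[log pi(a|s) * R],
# implemented by minimizing its negative.
log_prob = tf.placeholder(tf.float32, [None], name='log_prob')  # log pi(a_t | s_t)
returns = tf.placeholder(tf.float32, [None], name='returns')    # discounted return R_t
pg_loss = -tf.reduce_mean(log_prob * returns)
```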
175 changes: 175 additions & 0 deletions code/actor_critic_advantage.py
@@ -0,0 +1,175 @@
import tensorflow as tf
import numpy as np
import gym

'''
Comparison of policy-gradient-family algorithms:
  PG   loss = log_prob * estimated return (from the Bellman equation / Monte-Carlo estimate)
  A2C  loss = log_prob * TD error (from the critic network; it measures how much better the
              current action is than the average action)
  DDPG: the critic not only influences the actor, the actor also influences the critic; the
        critic does not just tell the actor how good its action was, it also tells it how to
        improve (by passing back the gradient dq/da)
  PPO:  adds a constraint on the size of the PG update to improve training stability;
        compared with A2C, only the actor network is more complex
'''
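
# The docstring above mentions PPO but this file only implements A2C; the helper below is a
# minimal, illustrative sketch of PPO's clipped surrogate objective (the argument names are
# placeholders and nothing in this file calls it).
def ppo_clipped_surrogate(log_prob_new, log_prob_old, advantage, epsilon=0.2):
    ratio = tf.exp(log_prob_new - log_prob_old)                           # pi_new(a|s) / pi_old(a|s)
    clipped_ratio = tf.clip_by_value(ratio, 1.0 - epsilon, 1.0 + epsilon)
    surrogate = tf.minimum(ratio * advantage, clipped_ratio * advantage)
    return -tf.reduce_mean(surrogate)                                     # minimize the negative objective
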
class Actor(object):  # essentially still policy gradient, but A2C updates at every step
    def __init__(self,
                 sess,  # the two networks share one session, so it is created outside
                 n_actions,
                 n_features,
                 lr=0.01, ):
        #self.ep_obs, self.ep_as, self.ep_rs = [], [], []  # not needed: A2C updates every step, so whole-episode data is never stored
self.sess = sess

self.s = tf.placeholder(tf.float32, [1, n_features], "state")
        self.a = tf.placeholder(tf.int32, None, "act")
        self.td_error = tf.placeholder(tf.float32, None, "td_error")  # scale of the update; the TD error can be read as Q(s, a) - V(s), the value of this action minus the average action value
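        # Note: the TD error is a one-sample estimate of the advantage,
        #   A(s, a) = Q(s, a) - V(s) ~= r + gamma * V(s') - V(s),
        # i.e. how much better taking action a is than the policy's average behaviour in s.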

        with tf.variable_scope('Actor'):  # variable_scope (instead of name_scope) allows variables to be shared within the scope
l1 = tf.layers.dense(
inputs=self.s,
units=20, # number of hidden units
activation=tf.nn.relu,
kernel_initializer=tf.random_normal_initializer(0., .1), # weights
bias_initializer=tf.constant_initializer(0.1), # biases
name='l1'
)

self.acts_prob = tf.layers.dense(
inputs=l1,
units=n_actions, # output units
activation=tf.nn.softmax, # get action probabilities
kernel_initializer=tf.random_normal_initializer(0., .1), # weights
bias_initializer=tf.constant_initializer(0.1), # biases
name='acts_prob'
)

        #with tf.name_scope('loss'):
            # maximizing the total reward (log_p * R) is the same as minimizing -(log_p * R), and TF only provides loss minimization
            #neg_log_prob = tf.reduce_sum(-tf.log(self.all_act_prob) * tf.one_hot(self.tf_acts, self.n_actions), axis=1)  # the minus sign turns this into gradient descent
            #loss = tf.reduce_mean(neg_log_prob * self.tf_vt)
with tf.variable_scope('loss'):
log_prob = tf.log(self.acts_prob[0,self.a]) #[[0.1,0.2,0.3]] -> 0.1, if a=0
self.loss = log_prob * self.td_error # advantage (TD_error) guided loss

with tf.name_scope('train'):
self.train_op = tf.train.AdamOptimizer(lr).minimize(-self.loss)

    def choose_action(self, s):  # choose an action
s = s[np.newaxis, :]
probs = self.sess.run(self.acts_prob, {self.s: s}) # get probabilities for all actions
action = np.random.choice(np.arange(probs.shape[1]), p=probs.ravel())
        return action  # returns an int


def learn(self, s, a, td):
s = s[np.newaxis, :]
feed_dict = {self.s: s, self.a: a, self.td_error: td}
_, loss = self.sess.run([self.train_op, self.loss], feed_dict)
return loss


class Critic(object):
def __init__(self, sess, n_features, lr=0.01, gamma=0.9):
self.sess = sess

self.s = tf.placeholder(tf.float32, [1, n_features], "state")
self.v_ = tf.placeholder(tf.float32, [1, 1], "v_next")
self.r = tf.placeholder(tf.float32, None, 'r')

with tf.variable_scope('Critic'):
l1 = tf.layers.dense(
inputs=self.s,
units=20, # number of hidden units
                activation=tf.nn.relu,
                # with activation=None (linear) the actor's convergence is easier to guarantee,
                # but a linear approximator can hardly learn the correct Q.
kernel_initializer=tf.random_normal_initializer(0., .1), # weights
bias_initializer=tf.constant_initializer(0.1), # biases
name='l1'
)

self.v = tf.layers.dense(
inputs=l1,
units=1, # output units
activation=None,
kernel_initializer=tf.random_normal_initializer(0., .1), # weights
bias_initializer=tf.constant_initializer(0.1), # biases
name='V'
)

with tf.variable_scope('squared_TD_error'):
self.td_error = self.r + gamma * self.v_ - self.v
self.loss = tf.square(self.td_error) # TD_error = (r+gamma*V_next) - V_eval
with tf.variable_scope('train'):
self.train_op = tf.train.AdamOptimizer(lr).minimize(self.loss)

def learn(self, s, r, s_):
s, s_ = s[np.newaxis, :], s_[np.newaxis, :]

v_ = self.sess.run(self.v, {self.s: s_})
td_error, _ = self.sess.run([self.td_error, self.train_op],
{self.s: s, self.v_: v_, self.r: r})
return td_error

np.random.seed(2)
tf.set_random_seed(2) # reproducible

# Hyperparameters
OUTPUT_GRAPH = False
MAX_EPISODE = 100  # 3000
DISPLAY_REWARD_THRESHOLD = 200  # threshold on the running reward above which the environment could be rendered
MAX_EP_STEPS = 1000 # maximum time step in one episode
RENDER = False # rendering wastes time
GAMMA = 0.9 # reward discount in TD error
LR_A = 0.01 # learning rate for actor
LR_C = 0.05 # learning rate for critic

env = gym.make('CartPole-v0')
env.seed(1) # reproducible
env = env.unwrapped

N_F = env.observation_space.shape[0]
N_A = env.action_space.n

sess = tf.Session()  # the two networks share one session

actor = Actor(sess, n_features=N_F, n_actions=N_A, lr=LR_A)
critic = Critic(sess, n_features=N_F, lr=LR_C) # we need a good teacher, so the teacher should learn faster than the actor

sess.run(tf.global_variables_initializer())

if OUTPUT_GRAPH:
tf.summary.FileWriter("logs/", sess.graph)

for i_episode in range(MAX_EPISODE):
state = env.reset()
t = 0
r_list = []

while True:
if RENDER:
env.render()
        action = actor.choose_action(state)
        state_, reward, done, info = env.step(action)
        if done:
            reward = -20  # penalize the terminal step (a trick: unwrapped CartPole otherwise gives +1 every step)
        r_list.append(reward)
        td_error = critic.learn(state, reward, state_)  # the critic updates V(s) and returns the TD error
        actor.learn(state, action, td_error)            # the actor uses the TD error as the advantage
        state = state_
        t += 1

        if done or t >= MAX_EP_STEPS:
            ep_rs_sum = sum(r_list)
            if 'running_reward' not in globals():
                running_reward = ep_rs_sum
            else:
                running_reward = running_reward * 0.95 + ep_rs_sum * 0.05
            if running_reward > DISPLAY_REWARD_THRESHOLD:
                RENDER = False  # set to True to render the environment once the running reward passes the threshold
            print("episode:", i_episode, " reward:", int(running_reward))
            break






136 changes: 136 additions & 0 deletions code/ddpg_update.py
@@ -0,0 +1,136 @@
import tensorflow as tf
import numpy as np
import gym
import time

##################### hyper parameters ####################

MAX_EPISODES = 200
MAX_EP_STEPS = 200
LR_A = 0.01 # learning rate for actor
LR_C = 0.02 # learning rate for critic
GAMMA = 0.9 # reward discount
TAU = 0.01 # soft replacement
MEMORY_CAPACITY = 10000
BATCH_SIZE = 32

RENDER = False
ENV_NAME = 'Pendulum-v0'

# Pendulum has continuous action and state spaces.
# Action space: a single torque value (dimension 1); continuous, but bounded to [-2, 2].
# State space: a 3-dimensional observation [cos(theta), sin(theta), angular velocity].

############################### DDPG ####################################
# Off-policy training: updates every step, on mini-batches sampled from a replay buffer.
class DDPG(object):
    def __init__(self, a_dim, s_dim, a_bound):  # build both network graphs; the actor and the critic each have a target network, and the target networks are not trained directly
        self.memory = np.zeros((MEMORY_CAPACITY, s_dim * 2 + a_dim + 1), dtype=np.float32)  # replay buffer; each row stores [s, a, r, s_] (s_dim*2 covers s and s_)
self.pointer = 0
self.sess = tf.Session()

self.a_dim, self.s_dim, self.a_bound = a_dim, s_dim, a_bound
        self.S = tf.placeholder(tf.float32, [None, s_dim], 's')  # None is a placeholder for the batch size
self.S_ = tf.placeholder(tf.float32, [None, s_dim], 's_')
self.R = tf.placeholder(tf.float32, [None,1], 'r')

        with tf.variable_scope('Actor'):
            self.a = self._build_a(self.S, scope='eval', trainable=True)   # the policy network being trained; it also collects data  # input s, output a
            a_ = self._build_a(self.S_, scope='target', trainable=False)   # the target actor is not trained; it only supplies a_ to the target critic  # input s_, output a_
        with tf.variable_scope('Critic'):
            q = self._build_c(self.S, self.a, scope='eval', trainable=True)   # the critic being trained; its prediction is regressed onto the Bellman target (TD error / MSE); note a is fed from memory when training the critic
            q_ = self._build_c(self.S_, a_, scope='target', trainable=False)  # the target critic is not trained; it provides Q(s_, a_) for the Bellman target, while the eval critic's dq/da drives the actor update

# networks parameters
self.ae_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/eval')
self.at_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/target')
self.ce_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/eval')
self.ct_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/target')

        # soft target updates: slowly copy the eval networks' parameters into the target networks
self.soft_replace = [tf.assign(t, (1-TAU)*t + TAU *e) for t, e in zip(self.at_params+self.ct_params,self.ae_params+self.ce_params)]

        # train the critic (eval) network
        q_target = self.R + GAMMA * q_  # Bellman target: q_ comes from the target critic fed with (s_, a_); used as the "label" for the prediction q
        td_error = tf.losses.mean_squared_error(labels=q_target, predictions=q)  # the prediction q comes from the eval critic fed with the current (s, a)
        self.ctrain = tf.train.AdamOptimizer(LR_C).minimize(td_error, var_list=self.ce_params)  # only the eval critic's parameters are trained, minimizing the MSE

        # train the actor (eval) network
        a_loss = -tf.reduce_mean(q)  # maximize q
        self.atrain = tf.train.AdamOptimizer(LR_A).minimize(a_loss, var_list=self.ae_params)
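        # Minimizing -mean(q) with respect to the actor's parameters is the deterministic
        # policy gradient: q = Q(s, a) with a = mu(s), so backpropagation applies the chain
        # rule dQ/da * da/dtheta, which is the "critic passes dq/da to the actor" idea noted above.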

self.sess.run(tf.global_variables_initializer())


def choose_action(self, s):
s = s[np.newaxis, :]
return self.sess.run(self.a, feed_dict={self.S: s})[0] # single action


def learn(self):
        # every learning step starts with a soft update of the target networks
self.sess.run(self.soft_replace)
indices = np.random.choice(MEMORY_CAPACITY, size=BATCH_SIZE)
        bt = self.memory[indices, :]  # sample a batch of transitions [s, a, r, s_] from memory
        bs = bt[:, :self.s_dim]                            # a batch of states
        ba = bt[:, self.s_dim: self.s_dim + self.a_dim]    # a batch of actions
        br = bt[:, -self.s_dim - 1: -self.s_dim]           # a batch of rewards
        bs_ = bt[:, -self.s_dim:]                          # a batch of next states

        # train on one batch; during this batch the target networks are effectively frozen
self.sess.run(self.atrain, {self.S: bs})
self.sess.run(self.ctrain, {self.S: bs, self.a: ba, self.R: br, self.S_: bs_})


    def store_transition(self, s, a, r, s_):  # standard bookkeeping for an off-policy algorithm
transition = np.hstack((s, a, [r], s_))
index = self.pointer % MEMORY_CAPACITY # replace the old memory with new memory
self.memory[index, :] = transition
self.pointer += 1

    def _build_a(self, s, scope, trainable):  # actor network: directly outputs a deterministic action a
with tf.variable_scope(scope):
net = tf.layers.dense(s, 30, activation=tf.nn.relu, name='l1', trainable=trainable)
            a = tf.layers.dense(net, self.a_dim, activation=tf.nn.tanh, name='a', trainable=trainable)  # tanh squashes a into [-1, 1]
            return tf.multiply(a, self.a_bound, name='scaled_a')  # multiplying by the bound [max] scales the output to [-max, max]; if the min and max are not negatives of each other, an extra clip/rescale is needed
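            # For an asymmetric action range [low, high], one illustrative alternative would be to
            # rescale the tanh output instead: a = low + (tanh_out + 1.0) * 0.5 * (high - low)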

    def _build_c(self, s, a, scope, trainable):  # critic network: outputs Q(s, a)
with tf.variable_scope(scope):
n_l1 = 30
w1_s = tf.get_variable('w1_s', [self.s_dim, n_l1], trainable=trainable)
w1_a = tf.get_variable('w1_a', [self.a_dim, n_l1], trainable=trainable)
b1 = tf.get_variable('b1', [1, n_l1], trainable=trainable)
net = tf.nn.relu(tf.matmul(s, w1_s) + tf.matmul(a, w1_a) + b1)
return tf.layers.dense(net, 1, trainable=trainable) # Q(s,a)

env = gym.make(ENV_NAME)
env = env.unwrapped
env.seed(1)
s_dim = env.observation_space.shape[0]
a_dim = env.action_space.shape[0]
a_bound = env.action_space.high
ddpg = DDPG(a_dim, s_dim, a_bound)

var = 3 # control exploration
t1 = time.time()
for i in range(MAX_EPISODES):
s = env.reset()
ep_reward = 0
    for j in range(MAX_EP_STEPS):  # games without a natural termination condition need an explicit step limit like this
if RENDER:
env.render()
a = ddpg.choose_action(s)
        a = np.clip(np.random.normal(a, var), -2, 2)  # exploration noise: sample around the actor's output with standard deviation var, then keep a within [-2, 2]
s_, r, done, info = env.step(a)

ddpg.store_transition(s, a, r/10, s_)
        if ddpg.pointer > MEMORY_CAPACITY:  # start training once the memory is full
            var *= 0.9995  # decay the exploration noise
            ddpg.learn()   # after the buffer has filled (10000 transitions), sample a batch and train once per environment step
s = s_
ep_reward += r
if j == MAX_EP_STEPS-1:
print('Episode:', i, ' Reward: %i' % int(ep_reward), 'Explore: %.2f' % var, )
# if ep_reward > -300:RENDER = True
break
print('Running time: ', time.time() - t1)