Policy Gradient can output actions directly, including continuous actions, but it cannot update after every single step (it only updates once an episode has finished).
Q-Learning first predicts Q values and then chooses an action from them; it cannot handle continuous actions or very large action spaces well, but it can update at every step.
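To make that contrast concrete, here is a minimal sketch of how the two families pick an action (the probability and Q-value arrays are made up for illustration and are not from the original post):

```python
import numpy as np

rng = np.random.default_rng(0)

# Policy Gradient: the network outputs a probability per action and we sample from it.
action_probs = np.array([0.2, 0.5, 0.3])   # e.g. softmax output of a policy network
pg_action = rng.choice(len(action_probs), p=action_probs)

# Q-Learning: the network outputs one Q value per action and we take the argmax,
# which only works when the action set is discrete and reasonably small.
q_values = np.array([1.2, 0.7, 2.4])       # e.g. output of a Q network
q_action = int(np.argmax(q_values))

print(pg_action, q_action)
```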
Actor-Critic in one sentence:
It combines Policy Gradient (the Actor) with value function approximation (the Critic). The Actor chooses actions according to a probability distribution, the Critic scores the action the Actor took, and the Actor adjusts its action probabilities based on the Critic's score.
Advantage of Actor-Critic: it can update at every single step, which is faster than vanilla Policy Gradient.
Disadvantage of Actor-Critic: learning depends on the Critic's value estimates, and the Critic is hard to make converge; updating the Actor at the same time makes convergence even harder. To address this, Google DeepMind proposed an upgraded version of Actor-Critic, Deep Deterministic Policy Gradient (DDPG), which borrows the strengths of DQN and tackles the convergence problem.
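As a rough sketch of the update rules behind this description (standard TD Actor-Critic formulas, matching the code later in the post; V is the Critic's value estimate, \pi_\theta the Actor's policy, \gamma the discount):

```latex
% TD error computed by the Critic
\delta_t = r_t + \gamma V(s_{t+1}) - V(s_t)

% Critic loss: squared TD error
L_{\text{critic}} = \delta_t^{2}

% Actor loss: log-probability of the chosen action, weighted by the TD error
L_{\text{actor}} = -\log \pi_\theta(a_t \mid s_t)\,\delta_t
```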
Actor network inputs: (s_t, a_t, TD_error)

The Actor network is almost the same as in Policy Gradient: a multi-class (softmax) classification network. The difference is in the loss. In Policy Gradient, the log-probability loss is weighted by V_t, which is accumulated from the return R over the whole episode; in Actor-Critic, the Actor's loss is instead weighted by TD_error, which is computed by the Critic network.
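A small sketch of that difference in the loss weight (the reward and log-probability numbers are invented for illustration and are not from the original code):

```python
import numpy as np

GAMMA = 0.9

# Policy Gradient (REINFORCE): weight each log-prob by the discounted return V_t,
# which can only be computed after the episode ends.
rewards = [1.0, 1.0, 1.0]                  # rewards collected over one episode
returns = np.zeros(len(rewards))
running = 0.0
for i in reversed(range(len(rewards))):
    running = rewards[i] + GAMMA * running  # V_t = r_t + gamma * V_{t+1}
    returns[i] = running
log_probs = np.array([-0.7, -0.4, -0.9])    # log pi(a_t|s_t) for the chosen actions
pg_loss = -np.mean(log_probs * returns)

# Actor-Critic: weight the log-prob of a single step by the Critic's TD error,
# so the Actor can update immediately after every step.
td_error = 0.3                              # produced by the Critic for this step
log_prob_t = -0.7                           # log pi(a_t|s_t) for this single step
ac_loss = -log_prob_t * td_error

print(pg_loss, ac_loss)
```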
Critic network inputs: (s_t, v_{t+1}, r); output: TD_error

At training time the Critic is fed (s_t, r, s_{t+1}) and computes:

    v_next = network(s_{t+1})                  # v_{t+1}
    V_eval = network(s_t)
    TD_error = (r + gamma * v_next) - V_eval
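A toy numeric walk-through of that Critic step (value_of is a hypothetical stand-in for the Critic network, with made-up state values):

```python
GAMMA = 0.9

def value_of(state):
    # stand-in for the Critic network; returns a made-up state value
    return {"s_t": 1.5, "s_t1": 2.0}[state]

r = 1.0                                   # reward observed after taking a_t in s_t
v_next = value_of("s_t1")                 # v_{t+1} = network(s_{t+1})
v_eval = value_of("s_t")                  # V_eval  = network(s_t)
td_error = (r + GAMMA * v_next) - v_eval  # 1.0 + 0.9*2.0 - 1.5 = 1.3
print(td_error)                           # Critic trains on td_error**2,
                                          # Actor uses td_error as its loss weight
```

The complete CartPole implementation from the original post is reproduced below.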
1 """ 2 Actor-Critic using TD-error as the Advantage, Reinforcement Learning. 3 4 The cart pole example. Policy is oscillated. 5 6 View more on my tutorial page: https://morvanzhou.github.io/tutorials/ 7 8 Using: 9 tensorflow 1.0 10 gym 0.8.0 11 """ 12 13 import numpy as np 14 import tensorflow as tf 15 import gym 16 17 np.random.seed(2) 18 tf.set_random_seed(2) # reproducible 19 20 # Superparameters 21 OUTPUT_GRAPH = False 22 MAX_EPISODE = 3000 23 DISPLAY_REWARD_THRESHOLD = 200 # renders environment if total episode reward is greater then this threshold 24 MAX_EP_STEPS = 1000 # maximum time step in one episode 25 RENDER = False # rendering wastes time 26 GAMMA = 0.9 # reward discount in TD error 27 LR_A = 0.001 # learning rate for actor 28 LR_C = 0.01 # learning rate for critic 29 30 env = gym.make(‘CartPole-v0‘) 31 env.seed(1) # reproducible 32 env = env.unwrapped 33 34 N_F = env.observation_space.shape[0] 35 N_A = env.action_space.n 36 37 38 class Actor(object): 39 def __init__(self, sess, n_features, n_actions, lr=0.001): 40 self.sess = sess 41 42 self.s = tf.placeholder(tf.float32, [1, n_features], "state") 43 self.a = tf.placeholder(tf.int32, None, "act") 44 self.td_error = tf.placeholder(tf.float32, None, "td_error") # TD_error 45 46 with tf.variable_scope(‘Actor‘): 47 l1 = tf.layers.dense( 48 inputs=self.s, 49 units=20, # number of hidden units 50 activation=tf.nn.relu, 51 kernel_initializer=tf.random_normal_initializer(0., .1), # weights 52 bias_initializer=tf.constant_initializer(0.1), # biases 53 name=‘l1‘ 54 ) 55 56 self.acts_prob = tf.layers.dense( 57 inputs=l1, 58 units=n_actions, # output units 59 activation=tf.nn.softmax, # get action probabilities 60 kernel_initializer=tf.random_normal_initializer(0., .1), # weights 61 bias_initializer=tf.constant_initializer(0.1), # biases 62 name=‘acts_prob‘ 63 ) 64 65 with tf.variable_scope(‘exp_v‘): 66 log_prob = tf.log(self.acts_prob[0, self.a]) 67 self.exp_v = tf.reduce_mean(log_prob * self.td_error) # advantage (TD_error) guided loss 68 69 with tf.variable_scope(‘train‘): 70 self.train_op = tf.train.AdamOptimizer(lr).minimize(-self.exp_v) # minimize(-exp_v) = maximize(exp_v) 71 72 def learn(self, s, a, td): 73 s = s[np.newaxis, :] 74 feed_dict = {self.s: s, self.a: a, self.td_error: td} 75 _, exp_v = self.sess.run([self.train_op, self.exp_v], feed_dict) 76 return exp_v 77 78 def choose_action(self, s): 79 s = s[np.newaxis, :] 80 probs = self.sess.run(self.acts_prob, {self.s: s}) # get probabilities for all actions 81 return np.random.choice(np.arange(probs.shape[1]), p=probs.ravel()) # return a int 82 83 84 class Critic(object): 85 def __init__(self, sess, n_features, lr=0.01): 86 self.sess = sess 87 88 self.s = tf.placeholder(tf.float32, [1, n_features], "state") 89 self.v_ = tf.placeholder(tf.float32, [1, 1], "v_next") 90 self.r = tf.placeholder(tf.float32, None, ‘r‘) 91 92 with tf.variable_scope(‘Critic‘): 93 l1 = tf.layers.dense( 94 inputs=self.s, 95 units=20, # number of hidden units 96 activation=tf.nn.relu, # None 97 # have to be linear to make sure the convergence of actor. 98 # But linear approximator seems hardly learns the correct Q. 
99 kernel_initializer=tf.random_normal_initializer(0., .1), # weights 100 bias_initializer=tf.constant_initializer(0.1), # biases 101 name=‘l1‘ 102 ) 103 104 self.v = tf.layers.dense( 105 inputs=l1, 106 units=1, # output units 107 activation=None, 108 kernel_initializer=tf.random_normal_initializer(0., .1), # weights 109 bias_initializer=tf.constant_initializer(0.1), # biases 110 name=‘V‘ 111 ) 112 113 with tf.variable_scope(‘squared_TD_error‘): 114 self.td_error = self.r + GAMMA * self.v_ - self.v 115 self.loss = tf.square(self.td_error) # TD_error = (r+gamma*V_next) - V_eval 116 with tf.variable_scope(‘train‘): 117 self.train_op = tf.train.AdamOptimizer(lr).minimize(self.loss) 118 119 def learn(self, s, r, s_): 120 s, s_ = s[np.newaxis, :], s_[np.newaxis, :] 121 122 v_ = self.sess.run(self.v, {self.s: s_}) 123 td_error, _ = self.sess.run([self.td_error, self.train_op], 124 {self.s: s, self.v_: v_, self.r: r}) 125 return td_error 126 127 128 sess = tf.Session() 129 130 actor = Actor(sess, n_features=N_F, n_actions=N_A, lr=LR_A) 131 critic = Critic(sess, n_features=N_F, lr=LR_C) # we need a good teacher, so the teacher should learn faster than the actor 132 133 sess.run(tf.global_variables_initializer()) 134 135 if OUTPUT_GRAPH: 136 tf.summary.FileWriter("logs/", sess.graph) 137 138 for i_episode in range(MAX_EPISODE): 139 s = env.reset() 140 t = 0 141 track_r = [] 142 while True: 143 if RENDER: env.render() 144 145 a = actor.choose_action(s) 146 147 s_, r, done, info = env.step(a) 148 149 if done: r = -20 150 151 track_r.append(r) 152 153 td_error = critic.learn(s, r, s_) # gradient = grad[r + gamma * V(s_) - V(s)] 154 actor.learn(s, a, td_error) # true_gradient = grad[logPi(s,a) * td_error] 155 156 s = s_ 157 t += 1 158 159 if done or t >= MAX_EP_STEPS: 160 ep_rs_sum = sum(track_r) 161 162 if ‘running_reward‘ not in globals(): 163 running_reward = ep_rs_sum 164 else: 165 running_reward = running_reward * 0.95 + ep_rs_sum * 0.05 166 if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True # rendering 167 print("episode:", i_episode, " reward:", int(running_reward)) 168 break
Source: https://www.cnblogs.com/zle1992/p/10243563.html