标签:一句话 iter ror its hidden space 升级 log tps
Policy Gradient 可以直接预测出动作,也可以预测连续动作,但是无法单步更新。
QLearning 先预测出Q值,根据Q值选动作,无法预测连续动作、或者动作种类多的情况,但是可以单步更新。
一句话概括 Actor Critic 方法:
结合了 Policy Gradient (Actor) 和 Function Approximation (Critic) 的方法. Actor 基于概率选行为, Critic 基于 Actor 的行为评判行为的得分, Actor 根据 Critic 的评分修改选行为的概率.
Actor Critic 方法的优势: 可以进行单步更新, 比传统的 Policy Gradient 要快.
Actor Critic 方法的劣势: 取决于 Critic 的价值判断, 但是 Critic 难收敛, 再加上 Actor 的更新, 就更难收敛. 为了解决收敛问题, Google Deepmind 提出了 Actor Critic 的升级版 Deep Deterministic Policy Gradient. 后者融合了 DQN 的优势, 解决了收敛难的问题.
Actor 网络的输入: $(s_t, a_t, \text{TD\_error})$
Actor 网络与policy gradient 差不多,多分类网络,在算loss时候,policy gradient需要乘一个权重Vt,而Vt是根据回报R 累计计算的。
在Actor中,在算loss时候,loss的权重是TDerror
TDerror是Critic网络计算出来的。
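Critic 网络的输入: $(s_t, v_{t+1}, r)$, 输出 TD_error:
$V_{\text{eval}} = \text{network}(s_t)$
$\text{TD\_error} = (r + \gamma \cdot V_{\text{next}}) - V_{\text{eval}}$
学习的时候输入: $(s_t, r, s_{t+1})$
先计算 $v_{t+1} = \text{network}(s_{t+1})$ (即 $V_{\text{next}}$),
再把 $(s_t, v_{t+1}, r)$ 送入 Critic 网络.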
"""
Actor-Critic using TD-error as the Advantage, Reinforcement Learning.

The cart pole example. Policy is oscillated.

View more on my tutorial page: https://morvanzhou.github.io/tutorials/

Using:
tensorflow 1.0
gym 0.8.0
"""

import numpy as np
import tensorflow as tf
import gym

np.random.seed(2)
tf.set_random_seed(2)  # reproducible

# Hyperparameters
OUTPUT_GRAPH = False
MAX_EPISODE = 3000
DISPLAY_REWARD_THRESHOLD = 200  # start rendering once the running reward exceeds this threshold
MAX_EP_STEPS = 1000  # maximum time step in one episode
RENDER = False  # rendering wastes time
GAMMA = 0.9  # reward discount in TD error
LR_A = 0.001  # learning rate for actor
LR_C = 0.01  # learning rate for critic

env = gym.make('CartPole-v0')
env.seed(1)  # reproducible
env = env.unwrapped

N_F = env.observation_space.shape[0]
N_A = env.action_space.n


class Actor(object):
    """Softmax policy network trained with the critic's TD error as weight.

    The graph takes a single state of shape ``[1, n_features]``, passes it
    through one 20-unit ReLU layer and outputs ``n_actions`` probabilities.

    Args:
        sess: TensorFlow session used to run the graph.
        n_features: Length of one state vector.
        n_actions: Number of discrete actions.
        lr: Learning rate of the Adam optimizer.
    """

    def __init__(self, sess, n_features, n_actions, lr=0.001):
        self.sess = sess

        self.s = tf.placeholder(tf.float32, [1, n_features], "state")
        self.a = tf.placeholder(tf.int32, None, "act")
        self.td_error = tf.placeholder(tf.float32, None, "td_error")  # TD_error

        with tf.variable_scope('Actor'):
            l1 = tf.layers.dense(
                inputs=self.s,
                units=20,  # number of hidden units
                activation=tf.nn.relu,
                kernel_initializer=tf.random_normal_initializer(0., .1),  # weights
                bias_initializer=tf.constant_initializer(0.1),  # biases
                name='l1'
            )

            self.acts_prob = tf.layers.dense(
                inputs=l1,
                units=n_actions,  # output units
                activation=tf.nn.softmax,  # get action probabilities
                kernel_initializer=tf.random_normal_initializer(0., .1),  # weights
                bias_initializer=tf.constant_initializer(0.1),  # biases
                name='acts_prob'
            )

        with tf.variable_scope('exp_v'):
            # Log-probability of the action that was actually taken.
            log_prob = tf.log(self.acts_prob[0, self.a])
            self.exp_v = tf.reduce_mean(log_prob * self.td_error)  # advantage (TD_error) guided loss

        with tf.variable_scope('train'):
            self.train_op = tf.train.AdamOptimizer(lr).minimize(-self.exp_v)  # minimize(-exp_v) = maximize(exp_v)

    def learn(self, s, a, td):
        """Run one training step and return the value of ``exp_v``.

        Args:
            s: One state vector; a leading batch axis is added here.
            a: Index of the action taken in ``s``.
            td: TD error to weight the log-probability with.
        """
        s = s[np.newaxis, :]
        feed_dict = {self.s: s, self.a: a, self.td_error: td}
        _, exp_v = self.sess.run([self.train_op, self.exp_v], feed_dict)
        return exp_v

    def choose_action(self, s):
        """Sample an action index from the policy's probabilities for ``s``."""
        s = s[np.newaxis, :]
        probs = self.sess.run(self.acts_prob, {self.s: s})  # get probabilities for all actions
        return np.random.choice(np.arange(probs.shape[1]), p=probs.ravel())  # return an int


class Critic(object):
    """State-value network that produces the TD error used by the actor.

    The graph takes a single state of shape ``[1, n_features]``, passes it
    through one 20-unit ReLU layer and outputs a single linear value.

    Args:
        sess: TensorFlow session used to run the graph.
        n_features: Length of one state vector.
        lr: Learning rate of the Adam optimizer.
    """

    def __init__(self, sess, n_features, lr=0.01):
        self.sess = sess

        self.s = tf.placeholder(tf.float32, [1, n_features], "state")
        self.v_ = tf.placeholder(tf.float32, [1, 1], "v_next")
        self.r = tf.placeholder(tf.float32, None, 'r')

        with tf.variable_scope('Critic'):
            l1 = tf.layers.dense(
                inputs=self.s,
                units=20,  # number of hidden units
                activation=tf.nn.relu,  # None
                # have to be linear to make sure the convergence of actor.
                # But a linear approximator seems to hardly learn the correct Q.
                kernel_initializer=tf.random_normal_initializer(0., .1),  # weights
                bias_initializer=tf.constant_initializer(0.1),  # biases
                name='l1'
            )

            self.v = tf.layers.dense(
                inputs=l1,
                units=1,  # output units
                activation=None,
                kernel_initializer=tf.random_normal_initializer(0., .1),  # weights
                bias_initializer=tf.constant_initializer(0.1),  # biases
                name='V'
            )

        with tf.variable_scope('squared_TD_error'):
            # v_ is fed in through a placeholder, so it acts as a constant
            # target: gradients only flow through self.v.
            self.td_error = self.r + GAMMA * self.v_ - self.v
            self.loss = tf.square(self.td_error)  # TD_error = (r+gamma*V_next) - V_eval
        with tf.variable_scope('train'):
            self.train_op = tf.train.AdamOptimizer(lr).minimize(self.loss)

    def learn(self, s, r, s_):
        """Run one training step on a transition and return its TD error.

        Args:
            s: State vector before the step.
            r: Reward received for the step.
            s_: State vector after the step.
        """
        s, s_ = s[np.newaxis, :], s_[np.newaxis, :]

        # Evaluate the next state first, then feed that value back in as the
        # bootstrap target for the current state.
        v_ = self.sess.run(self.v, {self.s: s_})
        td_error, _ = self.sess.run([self.td_error, self.train_op],
                                    {self.s: s, self.v_: v_, self.r: r})
        return td_error


sess = tf.Session()

actor = Actor(sess, n_features=N_F, n_actions=N_A, lr=LR_A)
critic = Critic(sess, n_features=N_F, lr=LR_C)  # we need a good teacher, so the teacher should learn faster than the actor

sess.run(tf.global_variables_initializer())

if OUTPUT_GRAPH:
    tf.summary.FileWriter("logs/", sess.graph)

# Exponentially smoothed episode return; None until the first episode ends.
running_reward = None

for i_episode in range(MAX_EPISODE):
    s = env.reset()
    t = 0
    track_r = []
    while True:
        if RENDER:
            env.render()

        a = actor.choose_action(s)

        s_, r, done, info = env.step(a)

        if done:
            r = -20  # replace the reward of the terminating step with a penalty

        track_r.append(r)

        td_error = critic.learn(s, r, s_)  # gradient = grad[r + gamma * V(s_) - V(s)]
        actor.learn(s, a, td_error)  # true_gradient = grad[logPi(s,a) * td_error]

        s = s_
        t += 1

        if done or t >= MAX_EP_STEPS:
            ep_rs_sum = sum(track_r)

            if running_reward is None:
                running_reward = ep_rs_sum
            else:
                running_reward = running_reward * 0.95 + ep_rs_sum * 0.05
            if running_reward > DISPLAY_REWARD_THRESHOLD:
                RENDER = True  # rendering
            print("episode:", i_episode, " reward:", int(running_reward))
            break
标签:一句话 iter ror its hidden space 升级 log tps
原文地址:https://www.cnblogs.com/zle1992/p/10243563.html