QLearning python实现

时间：2019-01-08 23:41:26 阅读：668 评论：0 收藏：0 [点我收藏+]
标签：cat view structure ack stop rms and color output
技术分享图片
  1 """
  2 This part of code is the Deep Q Network (DQN) brain.
  3 
  4 view the tensorboard picture about this DQN structure on: https://morvanzhou.github.io/tutorials/machine-learning/reinforcement-learning/4-3-DQN3/#modification
  5 
  6 View more on my tutorial page: https://morvanzhou.github.io/tutorials/
  7 
  8 Using:
  9 Tensorflow: r1.2
 10 """
 11 
 12 import numpy as np
 13 import tensorflow as tf
 14 
 15 np.random.seed(1)
 16 tf.set_random_seed(1)
 17 
 18 
 19 # Deep Q Network off-policy
 20 class DeepQNetwork:
 21     def __init__(
 22             self,
 23             n_actions,
 24             n_features,
 25             learning_rate=0.01,
 26             reward_decay=0.9,
 27             e_greedy=0.9,
 28             replace_target_iter=300,
 29             memory_size=500,
 30             batch_size=32,
 31             e_greedy_increment=None,
 32             output_graph=False,
 33     ):
 34         self.n_actions = n_actions
 35         self.n_features = n_features
 36         self.lr = learning_rate
 37         self.gamma = reward_decay
 38         self.epsilon_max = e_greedy
 39         self.replace_target_iter = replace_target_iter
 40         self.memory_size = memory_size
 41         self.batch_size = batch_size
 42         self.epsilon_increment = e_greedy_increment
 43         self.epsilon = 0 if e_greedy_increment is not None else self.epsilon_max
 44 
 45         # total learning step
 46         self.learn_step_counter = 0
 47 
 48         # initialize zero memory [s, a, r, s_]
 49         self.memory = np.zeros((self.memory_size, n_features * 2 + 2))
 50 
 51         # consist of [target_net, evaluate_net]
 52         self._build_net()
 53 
 54         t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=‘target_net‘)
 55         e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=‘eval_net‘)
 56 
 57         with tf.variable_scope(‘hard_replacement‘):
 58             self.target_replace_op = [tf.assign(t, e) for t, e in zip(t_params, e_params)]
 59 
 60         self.sess = tf.Session()
 61 
 62         if output_graph:
 63             # $ tensorboard --logdir=logs
 64             tf.summary.FileWriter("logs/", self.sess.graph)
 65 
 66         self.sess.run(tf.global_variables_initializer())
 67         self.cost_his = []
 68 
 69     def _build_net(self):
 70         # ------------------ all inputs ------------------------
 71         self.s = tf.placeholder(tf.float32, [None, self.n_features], name=‘s‘)  # input State
 72         self.s_ = tf.placeholder(tf.float32, [None, self.n_features], name=‘s_‘)  # input Next State
 73         self.r = tf.placeholder(tf.float32, [None, ], name=‘r‘)  # input Reward
 74         self.a = tf.placeholder(tf.int32, [None, ], name=‘a‘)  # input Action
 75 
 76         w_initializer, b_initializer = tf.random_normal_initializer(0., 0.3), tf.constant_initializer(0.1)
 77 
 78         # ------------------ build evaluate_net ------------------
 79         with tf.variable_scope(‘eval_net‘):
 80             e1 = tf.layers.dense(self.s, 20, tf.nn.relu, kernel_initializer=w_initializer,
 81                                  bias_initializer=b_initializer, name=‘e1‘)
 82             self.q_eval = tf.layers.dense(e1, self.n_actions, kernel_initializer=w_initializer,
 83                                           bias_initializer=b_initializer, name=‘q‘)
 84 
 85         # ------------------ build target_net ------------------
 86         with tf.variable_scope(‘target_net‘):
 87             t1 = tf.layers.dense(self.s_, 20, tf.nn.relu, kernel_initializer=w_initializer,
 88                                  bias_initializer=b_initializer, name=‘t1‘)
 89             self.q_next = tf.layers.dense(t1, self.n_actions, kernel_initializer=w_initializer,
 90                                           bias_initializer=b_initializer, name=‘t2‘)
 91 
 92         with tf.variable_scope(‘q_target‘):
 93             q_target = self.r + self.gamma * tf.reduce_max(self.q_next, axis=1, name=‘Qmax_s_‘)    # shape=(None, )
 94             self.q_target = tf.stop_gradient(q_target)
 95         with tf.variable_scope(‘q_eval‘):
 96             a_indices = tf.stack([tf.range(tf.shape(self.a)[0], dtype=tf.int32), self.a], axis=1)
 97             self.q_eval_wrt_a = tf.gather_nd(params=self.q_eval, indices=a_indices)    # shape=(None, )
 98         with tf.variable_scope(‘loss‘):
 99             self.loss = tf.reduce_mean(tf.squared_difference(self.q_target, self.q_eval_wrt_a, name=‘TD_error‘))
100         with tf.variable_scope(‘train‘):
101             self._train_op = tf.train.RMSPropOptimizer(self.lr).minimize(self.loss)
102 
103     def store_transition(self, s, a, r, s_):
104         if not hasattr(self, ‘memory_counter‘):
105             self.memory_counter = 0
106         transition = np.hstack((s, [a, r], s_))
107         # replace the old memory with new memory
108         index = self.memory_counter % self.memory_size
109         self.memory[index, :] = transition
110         self.memory_counter += 1
111 
112     def choose_action(self, observation):
113         # to have batch dimension when feed into tf placeholder
114         observation = observation[np.newaxis, :]
115 
116         if np.random.uniform() < self.epsilon:
117             # forward feed the observation and get q value for every actions
118             actions_value = self.sess.run(self.q_eval, feed_dict={self.s: observation})
119             action = np.argmax(actions_value)
120         else:
121             action = np.random.randint(0, self.n_actions)
122         return action
123 
124     def learn(self):
125         # check to replace target parameters
126         if self.learn_step_counter % self.replace_target_iter == 0:
127             self.sess.run(self.target_replace_op)
128             print(‘\ntarget_params_replaced\n‘)
129 
130         # sample batch memory from all memory
131         if self.memory_counter > self.memory_size:
132             sample_index = np.random.choice(self.memory_size, size=self.batch_size)
133         else:
134             sample_index = np.random.choice(self.memory_counter, size=self.batch_size)
135         batch_memory = self.memory[sample_index, :]
136 
137         _, cost = self.sess.run(
138             [self._train_op, self.loss],
139             feed_dict={
140                 self.s: batch_memory[:, :self.n_features],
141                 self.a: batch_memory[:, self.n_features],
142                 self.r: batch_memory[:, self.n_features + 1],
143                 self.s_: batch_memory[:, -self.n_features:],
144             })
145 
146         self.cost_his.append(cost)
147 
148         # increasing epsilon
149         self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max
150         self.learn_step_counter += 1
151 
152     def plot_cost(self):
153         import matplotlib.pyplot as plt
154         plt.plot(np.arange(len(self.cost_his)), self.cost_his)
155         plt.ylabel(‘Cost‘)
156         plt.xlabel(‘training steps‘)
157         plt.show()
158 
159 if __name__ == ‘__main__‘:
160     DQN = DeepQNetwork(3,4, output_graph=True)
QLearning python实现
标签：cat view structure ack stop rms and color output
原文地址：https://www.cnblogs.com/zle1992/p/10241794.html
踩
(0)
评论一句话评论（0）
分享档案
更多>
2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)
周排行