# 007_001_lab_q_net_cartpole.py
import numpy as np
import tensorflow as tf
from collections import deque
import gym
env = gym.make('CartPole-v0')
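# NOTE: this lab targets the TensorFlow 1.x graph API (tf.placeholder, tf.contrib)
# and the classic gym interface, where reset() returns the observation and step()
# returns (observation, reward, done, info); newer releases need adjustments.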
# Constants defining our neural network
learning_rate = 1e-1
input_size = env.observation_space.shape[0]
output_size = env.action_space.n
X = tf.placeholder(tf.float32, [None, input_size], name="input_x")
# First layer of weights
W1 = tf.get_variable("W1", shape=[input_size, output_size],
                     initializer=tf.contrib.layers.xavier_initializer())
Qpred = tf.matmul(X, W1)
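# This single linear layer is the whole Q-network: it maps the 4-dimensional
# CartPole observation to one Q-value per action (push left / push right).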
# We need to define the parts of the network needed for learning a policy
Y = tf.placeholder(shape=[None, output_size], dtype=tf.float32)
# Loss function
loss = tf.reduce_sum(tf.square(Y - Qpred))
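# i.e. the sum of squared errors between the hand-built Q targets (Y) and the
# network's predictions (Qpred)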
# Learning
train = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)
# Values for Q-learning
max_episodes = 5000
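# dis is the discount factor (gamma) in the Q-learning target: reward + gamma * max_a' Q(s', a')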
dis = 0.9
step_history = []
# Setting up the TensorFlow session
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)
for episode in range(max_episodes):
    # Anneal the exploration rate: e starts near 1 and decays as episodes accumulate
    e = 1. / ((episode / 10) + 1)
    step_count = 0
    state = env.reset()
    done = False
    # The Q-Network training
    while not done:
        step_count += 1
        x = np.reshape(state, [1, input_size])
        # Choose an action greedily from the Q-network
        # (with probability e, take a random action instead)
        Q = sess.run(Qpred, feed_dict={X: x})
        if np.random.rand(1) < e:
            action = env.action_space.sample()
        else:
            action = np.argmax(Q)
        # Get new state and reward from the environment
        next_state, reward, done, _ = env.step(action)
        if done:
            # Terminal step (the pole fell or the cart left the track): penalize it
            Q[0, action] = -100
        else:
            x_next = np.reshape(next_state, [1, input_size])
            # Obtain the Q' values by feeding the new state through our network
            Q_next = sess.run(Qpred, feed_dict={X: x_next})
            Q[0, action] = reward + dis * np.max(Q_next)
        # Train our network using target and predicted Q values on each step
        sess.run(train, feed_dict={X: x, Y: Q})
        state = next_state
    step_history.append(step_count)
    print("Episode: {} steps: {}".format(episode, step_count))
    # CartPole-v0 caps an episode at 200 steps, so treat the task as solved
    # once the last 10 episodes average more than 195 steps
    if len(step_history) > 10 and np.mean(step_history[-10:]) > 195:
        break
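# Optional: plot the recorded per-episode step counts to inspect the learning
# curve (assumes matplotlib is available; uncomment to use).
# import matplotlib.pyplot as plt
# plt.plot(step_history)
# plt.xlabel("episode")
# plt.ylabel("steps per episode")
# plt.show()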
# See our trained network in action
observation = env.reset()
reward_sum = 0
while True:
    env.render()
    x = np.reshape(observation, [1, input_size])
    Q = sess.run(Qpred, feed_dict={X: x})
    action = np.argmax(Q)
    observation, reward, done, _ = env.step(action)
    reward_sum += reward
    if done:
        print("Total score: {}".format(reward_sum))
        break
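# Close the render window and release the TF session once the demo episode ends.
env.close()
sess.close()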