# actor_critic_baseline_cart_pole.py
# note: tensorflow must be imported before gym
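#
# Actor-critic with a learned state-value baseline on the OpenAI Gym CartPole-v0
# environment, written against the TensorFlow 1.x graph API (tf.Session / tf.placeholder).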
import pickle
from collections import deque
import tensorflow as tf
import gym
import numpy as np
env = gym.make('CartPole-v0')
ACTIONS_COUNT = 2
FUTURE_REWARD_DISCOUNT = 0.9
LEARN_RATE_ACTOR = 0.01
LEARN_RATE_CRITIC = 0.01
STORE_SCORES_LEN = 5
GAMES_PER_TRAINING = 3
INPUT_NODES = env.observation_space.shape[0]
ACTOR_HIDDEN = 20
session = tf.Session()
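
# actor (policy) network: one tanh hidden layer followed by a softmax over the two CartPole actions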
actor_feed_forward_weights_1 = tf.Variable(tf.truncated_normal([INPUT_NODES, ACTOR_HIDDEN], stddev=0.01))
actor_feed_forward_bias_1 = tf.Variable(tf.constant(0.0, shape=[ACTOR_HIDDEN]))
actor_feed_forward_weights_2 = tf.Variable(tf.truncated_normal([ACTOR_HIDDEN, ACTIONS_COUNT], stddev=0.01))
actor_feed_forward_bias_2 = tf.Variable(tf.constant(0.1, shape=[ACTIONS_COUNT]))
actor_input_placeholder = tf.placeholder("float", [None, INPUT_NODES])
actor_hidden_layer = tf.nn.tanh(
    tf.matmul(actor_input_placeholder, actor_feed_forward_weights_1) + actor_feed_forward_bias_1)
actor_output_layer = tf.nn.softmax(
    tf.matmul(actor_hidden_layer, actor_feed_forward_weights_2) + actor_feed_forward_bias_2)
actor_action_placeholder = tf.placeholder("float", [None, ACTIONS_COUNT])
actor_advantage_placeholder = tf.placeholder("float", [None, 1])
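
# REINFORCE-style surrogate objective: (1/N) * sum_t A_t * log pi(a_t | s_t), where the one-hot
# action placeholder picks out the log-probability of the action actually taken; minimising the
# negated objective performs gradient ascent on it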
policy_gradient = tf.reduce_mean(actor_advantage_placeholder * actor_action_placeholder * tf.log(actor_output_layer))
actor_train_operation = tf.train.AdamOptimizer(LEARN_RATE_ACTOR).minimize(-policy_gradient)
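
# critic (value) network: one tanh hidden layer with a single linear output estimating the
# discounted return of a state; it is fitted by mean squared error against the observed returns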
CRITIC_HIDDEN = 20
critic_feed_forward_weights_1 = tf.Variable(tf.truncated_normal([INPUT_NODES, CRITIC_HIDDEN], stddev=0.01))
critic_feed_forward_bias_1 = tf.Variable(tf.constant(0.0, shape=[CRITIC_HIDDEN]))
critic_feed_forward_weights_2 = tf.Variable(tf.truncated_normal([CRITIC_HIDDEN, 1], stddev=0.01))
critic_feed_forward_bias_2 = tf.Variable(tf.constant(0.0, shape=[1]))
critic_input_placeholder = tf.placeholder("float", [None, INPUT_NODES])
critic_hidden_layer = tf.nn.tanh(
    tf.matmul(critic_input_placeholder, critic_feed_forward_weights_1) + critic_feed_forward_bias_1)
critic_output_layer = tf.matmul(critic_hidden_layer, critic_feed_forward_weights_2) + critic_feed_forward_bias_2
critic_target_placeholder = tf.placeholder("float", [None, 1])
critic_cost = tf.reduce_mean(tf.square(critic_target_placeholder - critic_output_layer))
critic_train_operation = tf.train.AdamOptimizer(LEARN_RATE_CRITIC).minimize(critic_cost)
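
# advantage A_t = G_t - V(s_t): the discounted return minus the critic's estimate (the baseline),
# used to weight the actor's policy-gradient update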
critic_advantages = critic_target_placeholder - critic_output_layer
scores = deque(maxlen=STORE_SCORES_LEN)
# set the first action to do nothing
last_action = np.zeros(ACTIONS_COUNT)
last_action[1] = 1
time = 0
session.run(tf.initialize_all_variables())
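
# sample a one-hot action from the probabilities given by the actor's softmax output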
def choose_next_action(state):
    probability_of_actions = session.run(actor_output_layer, feed_dict={actor_input_placeholder: [state]})[0]
    try:
        move = np.random.multinomial(1, probability_of_actions)
    except ValueError:
        # because of rounding errors probability_of_actions can sum to slightly more than 1,
        # which np.random.multinomial rejects, so scale it down slightly to make it valid
        move = np.random.multinomial(1, probability_of_actions / (sum(probability_of_actions) + 1e-6))
    return move
def train(states, actions_taken, advantages):
    # learn that taking these actions in these states leads to these advantages
    session.run(actor_train_operation, feed_dict={
        actor_input_placeholder: states,
        actor_action_placeholder: actions_taken,
        actor_advantage_placeholder: advantages})
last_state = env.reset()
total_reward = 0
current_game_observations = []
current_game_rewards = []
current_game_actions = []
episode_observation = []
episode_rewards = []
episode_actions = []
games = 0
critic_costs = deque(maxlen=100)
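
# main loop: play games one step at a time; when a game ends, compute discounted returns,
# fit the critic on them and record the resulting advantages; every GAMES_PER_TRAINING games,
# update the actor with the accumulated observations, actions and advantages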
while True:
    env.render()
    last_action = choose_next_action(last_state)
    current_state, reward, terminal, info = env.step(np.argmax(last_action))
    total_reward += reward

    if terminal:
        reward = -.10

    current_game_observations.append(last_state)
    current_game_rewards.append(reward)
    current_game_actions.append(last_action)

    if terminal:
        games += 1
        scores.append(total_reward)

        # turn the per-step rewards into discounted returns, used as regression targets for the critic
        cumulative_reward = 0
        for i in reversed(range(len(current_game_observations))):
            cumulative_reward = current_game_rewards[i] + FUTURE_REWARD_DISCOUNT * cumulative_reward
            current_game_rewards[i] = [cumulative_reward]

        _, cost, advantages = session.run([critic_train_operation, critic_cost, critic_advantages], {
            critic_input_placeholder: current_game_observations,
            critic_target_placeholder: current_game_rewards})

        critic_costs.append(cost)

        print("Game: %s reward %s average scores %s critic cost %s" %
              (games, total_reward, np.mean(scores), np.mean(critic_costs)))

        episode_observation.extend(current_game_observations)
        episode_actions.extend(current_game_actions)
        episode_rewards.extend(advantages)

        total_reward = 0
        current_game_observations = []
        current_game_rewards = []
        current_game_actions = []

        if games % GAMES_PER_TRAINING == 0:
            train(episode_observation, episode_actions, episode_rewards)
            episode_observation = []
            episode_actions = []
            episode_rewards = []

    time += 1

    # update the old values
    if terminal:
        last_state = env.reset()
    else:
        last_state = current_state