policy.py
from __future__ import division
import numpy as np
from util import *


class Policy(object):
    """Abstract base class for all implemented policies.

    Each policy helps with the selection of the action to take on an environment.

    Do not use this abstract base class directly; instead use one of the concrete
    policies implemented below.

    To implement your own policy, you have to implement the following method:

    - `select_action`

    # Arguments
        agent (rl.core.Agent): Agent used
    """
    def _set_agent(self, agent):
        self.agent = agent

    @property
    def metrics_names(self):
        return []

    @property
    def metrics(self):
        return []

    def select_action(self, **kwargs):
        raise NotImplementedError()

    def get_config(self):
        """Return configuration of the policy.

        # Returns
            Configuration as dict
        """
        return {}
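

# A minimal sketch (not from the original file) of how to implement a custom policy
# by subclassing `Policy` and overriding `select_action`, as described in the
# docstring above. The class name and behaviour here are purely illustrative.
class UniformRandomPolicy(Policy):
    """Example only: pick one of `nb_actions` actions uniformly at random."""
    def select_action(self, nb_actions):
        return np.random.randint(0, nb_actions)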
class SoftmaxPolicy(Policy):
    """Implement the softmax policy for a multinomial distribution.

    Simple policy:

    - takes an action according to the given probability distribution
    """
    def select_action(self, nb_actions, probs):
        """Return the selected action.

        # Arguments
            nb_actions (int): Number of possible actions
            probs (np.ndarray): Probability for each action

        # Returns
            action
        """
        action = np.random.choice(range(nb_actions), p=probs)
        return action
class EpsGreedyQPolicy(Policy):
    """Implement the epsilon-greedy policy.

    The eps-greedy policy either:

    - takes a random action with probability epsilon
    - takes the current best action with probability (1 - epsilon)
    """
    def __init__(self, eps=.1):
        super(EpsGreedyQPolicy, self).__init__()
        self.eps = eps

    def select_action(self, q_values):
        """Return the selected action.

        # Arguments
            q_values (np.ndarray): List of the estimations of Q for each action

        # Returns
            Selected action
        """
        assert q_values.ndim == 1
        nb_actions = q_values.shape[0]

        if np.random.uniform() < self.eps:
            # np.random.randint replaces the deprecated np.random.random_integers;
            # it draws a uniform random action index in [0, nb_actions).
            action = np.random.randint(0, nb_actions)
        else:
            action = np.argmax(q_values)
        return action

    def get_config(self):
        """Return configuration of EpsGreedyQPolicy.

        # Returns
            Dict of config
        """
        config = super(EpsGreedyQPolicy, self).get_config()
        config['eps'] = self.eps
        return config
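
# Illustrative usage (assumed, not from the original file): with eps=0.1 the policy
# returns argmax(q_values) roughly 90% of the time and a uniformly random action
# index roughly 10% of the time, e.g.:
#     policy = EpsGreedyQPolicy(eps=0.1)
#     action = policy.select_action(q_values=np.array([0.2, 0.8, 0.5]))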
class GreedyQPolicy(Policy):
    """Implement the greedy policy.

    The greedy policy returns the current best action according to q_values.
    """
    def select_action(self, q_values):
        """Return the selected action.

        # Arguments
            q_values (np.ndarray): List of the estimations of Q for each action

        # Returns
            Selected action
        """
        assert q_values.ndim == 1
        action = np.argmax(q_values)
        return action
class BoltzmannQPolicy(Policy):
    """Implement the Boltzmann Q policy.

    The Boltzmann Q policy builds a probability distribution over the Q-values
    and returns an action selected randomly according to this distribution.
    """
    def __init__(self, tau=1., clip=(-500., 500.)):
        super(BoltzmannQPolicy, self).__init__()
        self.tau = tau
        self.clip = clip

    def select_action(self, q_values):
        """Return the selected action.

        # Arguments
            q_values (np.ndarray): List of the estimations of Q for each action

        # Returns
            Selected action
        """
        assert q_values.ndim == 1
        q_values = q_values.astype('float64')
        nb_actions = q_values.shape[0]

        # Softmax over temperature-scaled Q-values; clipping avoids overflow in np.exp.
        exp_values = np.exp(np.clip(q_values / self.tau, self.clip[0], self.clip[1]))
        probs = exp_values / np.sum(exp_values)
        action = np.random.choice(range(nb_actions), p=probs)
        return action

    def get_config(self):
        """Return configuration of BoltzmannQPolicy.

        # Returns
            Dict of config
        """
        config = super(BoltzmannQPolicy, self).get_config()
        config['tau'] = self.tau
        config['clip'] = self.clip
        return config
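

if __name__ == '__main__':
    # Illustrative smoke test (not part of the original module): exercise the concrete
    # policies above on a small, made-up Q-value / probability vector.
    q_values = np.array([1.0, 2.0, 0.5])

    print(GreedyQPolicy().select_action(q_values=q_values))            # always argmax -> 1
    print(EpsGreedyQPolicy(eps=0.1).select_action(q_values=q_values))  # mostly 1, sometimes random
    print(BoltzmannQPolicy(tau=1.0).select_action(q_values=q_values))  # sampled from softmax(q / tau)
    print(SoftmaxPolicy().select_action(nb_actions=3, probs=[0.2, 0.5, 0.3]))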