4_3.py
import numpy as np
import matplotlib.pyplot as plt

# Gambler's problem (Sutton & Barto, Example 4.3). The gambler's capital
# runs from 0 to 100; 0 (ruin) and 100 (goal) are terminal, and reaching
# 100 is the only outcome with a nonzero reward.
STATES = np.arange(1, 100, dtype=int)
v = np.zeros(101)
v[100] = 1
ph = 0.4  # probability that a coin flip comes up heads
# Value iteration: back up every state until the value function stops
# changing.
while True:
    newv = np.copy(v)
    for state in STATES:
        rewards = []
        # Feasible stakes are 0 .. min(state, 100 - state).
        for action in np.arange(min(state, 100 - state) + 1):
            reward = ph * v[state + action] + (1 - ph) * v[state - action]
            rewards.append(reward)
        newv[state] = np.max(rewards)
    value_change = abs(newv - v).sum()
    print('value change: ' + str(value_change))
    if value_change < 1e-9:
        print(np.transpose(v))
        break
    v = newv
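
# The sweep above is the Bellman optimality backup for this undiscounted
# episodic MDP. All intermediate rewards are zero, so for each capital s
#     v(s) <- max over a in {0, ..., min(s, 100 - s)} of
#             ph * v(s + a) + (1 - ph) * v(s - a)
# i.e. an action's value is just the expected value of its successor state.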
# Read the greedy policy off the converged value function.
policy = np.zeros(101)
policy[100] = 0
for state in STATES:
    rewards = []
    for action in np.arange(min(state, 100 - state) + 1):
        reward = ph * v[state + action] + (1 - ph) * v[state - action]
        rewards.append(reward)
    # Skip the trivial zero stake and round before the argmax so that ties
    # between near-equal action values are broken consistently; see
    # https://github.com/ShangtongZhang/reinforcement-learning-an-introduction/issues/83
    actions = np.arange(min(state, 100 - state) + 1)
    policy[state] = actions[np.argmax(np.round(rewards[1:], 5)) + 1]
print(np.transpose(policy))
# Plot the converged value estimates (top) and the final policy (bottom).
fig, ax = plt.subplots(2, 1)
ax[0].plot(v)
ax[1].scatter(np.arange(101), np.transpose(policy))
plt.show()
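
With ph = 0.4 the output should resemble Figure 4.3 of Sutton & Barto: the upper panel shows the converged value estimates over capital, and the lower panel the final stake policy, which bets the entire capital of 50 at state 50. The optimal policy is not unique for this problem, which is why the argmax above uses the rounding step to pick one optimal action deterministically.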