main_pseudo.py
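# High-level training loop (pseudocode): an agent chooses actions with a
# short look-ahead, the environment simulates one time step per action,
# experiences accumulate in a replay batch, and the Q-function approximator
# is refit from random minibatches; epsilon and alpha are annealed once per episode.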
import numpy as np
import random
BUFFER_LEN = 100
MINI_BATCH_SIZE = 50
batch = []
reward_plots = []
for episode in range(max_episode):
    currentState = env_instance.get_initial_state(episode)  # Initial state for the day of the training data.
    for time in range(T):
        # 1. Based on a 2-step look-ahead, choose an action.
        action = agent_instance.choose_action(currentState)
        # 2. Execute the action in simulation.
        nextState, reward, condition, finalState = env_instance.next_step(currentState, time)
        # 3. Grow the experience batch and perform conditional checking.
        # NOTE - Perform conditional checking on reward in the Function Approx module.
        #        kQmax is assumed to be the Q-value target obtained from the agent's
        #        look-ahead, so each stored experience is a (state, Q-target) pair.
        experience = (currentState, kQmax)  # (currentState, Q)
        batch.append(experience)
        if len(batch) > BUFFER_LEN:
            # 4. Update the Q-function from a random minibatch, then refresh the batch.
            minibatch = random.sample(batch, MINI_BATCH_SIZE)
            function_approx_instance.update_qfunction(minibatch, agent_instance)
            batch = []  # refresh batch
        if not condition:
            # Episode terminated early; record the final reward and stop.
            reward_plots.append(reward)
            break
        currentState = nextState
        reward_plots.append(reward)
    # Annealing: decay the exploration rate and the learning rate after each episode.
    # NOTE - placeholder decay amounts; choose a schedule appropriate to the problem.
    agent_instance.epsilon -= 1 / max_episode
    agent_instance.alpha -= 1 / max_episode
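
# --------------------------------------------------------------------------
# Illustrative sketch (an assumption, not part of the pseudocode above): one
# possible shape for the function-approximation module. It assumes each
# experience is a (state_vector, q_target) pair, matching the
# (currentState, kQmax) entries collected in the loop, and fits a linear
# Q-value model with a numpy least-squares solve. The class name, constructor,
# and predict() method are hypothetical placeholders; only
# update_qfunction(minibatch, agent_instance) mirrors the call made above.
# Usage sketch: function_approx_instance = LinearQFunctionApprox(state_dim=...)
# --------------------------------------------------------------------------
class LinearQFunctionApprox:
    def __init__(self, state_dim):
        # Weights (plus a bias term) of a linear Q-value estimate.
        self.weights = np.zeros(state_dim + 1)

    def predict(self, state):
        # Q(s) ~= dot(w, [s, 1])
        x = np.append(np.asarray(state, dtype=float), 1.0)
        return float(np.dot(self.weights, x))

    def update_qfunction(self, minibatch, agent_instance=None):
        # Refit the linear model to the sampled (state, Q-target) pairs.
        states = np.array([np.append(np.asarray(s, dtype=float), 1.0)
                           for s, q in minibatch])
        targets = np.array([q for s, q in minibatch])
        self.weights, *_ = np.linalg.lstsq(states, targets, rcond=None)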