# arguments.py
import os
import json
import argparse

import numpy as np
from tensorboardX import SummaryWriter


def get_args():
parser = argparse.ArgumentParser(
description='PyTorch Policy Gradient (Bayesian Quadrature/Monte Carlo)'
)
#--------------------------------------------------------------------------------------------------------------------------------------------------------
# General arguments
parser.add_argument('--env-name',
default="Swimmer-v2",
metavar='G',
help='Name of the gym environment to run')
parser.add_argument('--gamma',
type=float,
default=0.995,
metavar='G',
help='discount factor (default: 0.995)')
parser.add_argument('--batch-size',
type=int,
default=15000,
metavar='N',
help='state-action sample size (default: 15000)')
    parser.add_argument('--pg_algorithm',
                        default="VanillaPG",
                        help='TRPO | VanillaPG | NPG. Selects the policy optimization technique.')
parser.add_argument('--render',
action='store_true',
help='renders the policy roll-out in the environment')
parser.add_argument('--output_directory',
default="session_logs/",
metavar='G',
help='writes the session logs to this directory')
    parser.add_argument('--gpu_id',
                        default="0",
                        metavar='G',
                        help='Target GPU for deployment. GP kernel learning does not support multi-GPU training.')
    parser.add_argument('--seed',
                        type=int,
                        default=-1,
                        metavar='N',
                        help='random seed (default: -1, i.e., draw a fresh random seed). Useful for debugging.')
#--------------------------------------------------------------------------------------------------------------------------------------------------------
# GAE arguments
parser.add_argument("--advantage_flag",
action='store_true',
help=
"Replaces Monte-Carlo/TD(1) action-value estimates with generalized advantage estimates (GAE)")
parser.add_argument('--tau',
type=float,
default=0.97,
metavar='G',
help='GAE exponentially-weighted average coefficient (default: 0.97)')
#--------------------------------------------------------------------------------------------------------------------------------------------------------
# LR for VanillaPG and NPG
    parser.add_argument('--lr',
                        type=float,
                        default=7e-4,
                        metavar='G',
                        help='learning rate (default: 7e-4)')
#--------------------------------------------------------------------------------------------------------------------------------------------------------
# TRPO arguments
parser.add_argument('--max-kl',
type=float,
default=1e-2,
metavar='G',
help=
'Trust region size, i.e., the max allowed KL divergence between the old and updated policy (default: 1e-2)')
    parser.add_argument('--damping',
                        type=float,
                        default=1e-1,
                        metavar='G',
                        help='Damping coefficient, for numerical stability and quick convergence of the conjugate-gradient Fisher-inverse computation.')
#--------------------------------------------------------------------------------------------------------------------------------------------------------
# Policy gradient estimator arguments
    parser.add_argument('--pg_estimator',
                        default="BQ",
                        metavar='G',
                        help='Selects the PG estimator. BQ: Bayesian Quadrature | MC: Monte Carlo')
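    # Two-stage parsing: parse_known_args() peeks at --pg_estimator here so the
    # BQ-specific options below are registered only when the BQ estimator is in use.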
if parser.parse_known_args()[0].pg_estimator == 'BQ':
        parser.add_argument('--svd_low_rank',
                            type=int,
                            default=-1,
                            metavar='N',
                            help='Specifies the (low) rank for approximating the U and Cov matrices with FastSVD')
        parser.add_argument('--fisher_coefficient',
                            type=float,
                            default=5e-5,
                            metavar='G',
                            help="The coefficient of the Fisher kernel, i.e., c_2, in the PG estimate U(c_1 K_s + c_2 K_f + sigma^2 I)^{-1} A^{GAE}")
        parser.add_argument('--state_coefficient',
                            type=float,
                            default=1,
                            metavar='G',
                            help="The coefficient of the state kernel, i.e., c_1, in the PG estimate U(c_1 K_s + c_2 K_f + sigma^2 I)^{-1} A^{GAE}")
        parser.add_argument('--likelihood_noise_level',
                            type=float,
                            default=1e-4,
                            metavar='G',
                            help="the GP's likelihood noise variance sigma^2")
parser.add_argument("--UAPG_flag",
action='store_true',
help=
"If true then the gradient covariance is used for computing UAPG updates")
parser.add_argument('--UAPG_epsilon',
type=float,
default=3.0,
metavar='G',
help=
'Maximum factor by which a DBQPG compoment stepsize is increased during the UAPG update (for NPG or TRPO)')
args = parser.parse_args()
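    # CUDA_VISIBLE_DEVICES must be exported before any CUDA context is created
    # (i.e., before the first torch.cuda call), so the device is pinned here.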
os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_id
if args.seed == -1:
args.seed = int(np.random.randint(low=0, high=100000000, size=1)[0])
#--------------------------------------------------------------------------------------------------------------------------------------------------------
    # Hyperparameter helper for a few MuJoCo environments (comment out this
    # section for a manual override)
    with open('helper_config.json') as config_file:
        config = json.load(config_file)
    if args.env_name in config:
        env_config = config[args.env_name]
        args.advantage_flag = env_config["advantage_flag"]
        args.svd_low_rank = env_config[args.pg_algorithm]["svd_low_rank"]
        if args.pg_algorithm != 'TRPO':
            args.lr = env_config[args.pg_algorithm]["lr"]
            if args.pg_estimator == 'MC' and 'MC_lr' in env_config[args.pg_algorithm]:
                args.lr = env_config[args.pg_algorithm]["MC_lr"]
    # svd_low_rank is registered only for the BQ estimator; fall back to rank 50
    # when it was set neither on the command line nor by the helper config.
    if args.pg_estimator == 'BQ' and args.svd_low_rank == -1:
        args.svd_low_rank = 50
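    # For reference, an entry in helper_config.json is expected to look roughly
    # like this, given the lookups above (hypothetical values; consult the
    # repo's actual file):
    #
    #   {
    #       "Swimmer-v2": {
    #           "advantage_flag": true,
    #           "VanillaPG": {"svd_low_rank": 50, "lr": 7e-4, "MC_lr": 1e-3},
    #           "NPG":       {"svd_low_rank": 50, "lr": 7e-4},
    #           "TRPO":      {"svd_low_rank": 50}
    #       }
    #   }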
#--------------------------------------------------------------------------------------------------------------------------------------------------------
    # Compose the log filename from the session's hyperparameters; the writer
    # below logs the cumulative-reward statistics over episodes.
    prefix = f"{args.pg_estimator}_{args.pg_algorithm}"
    write_filename = f"{prefix}_{args.env_name}_GAE({args.advantage_flag})_LR({args.lr})"
    if args.pg_estimator == "BQ":
        write_filename += (f"_SVDrank({args.svd_low_rank})"
                           f"_UAPG({args.UAPG_flag})_UAPGeps({args.UAPG_epsilon})")
    write_filename += f"_batchsize({args.batch_size})_seed({args.seed})"
    pg_estimator_name = ("MC" if args.pg_estimator == "MC" else
                         "UAPG" if args.UAPG_flag else "DBQPG")
    final_directory = os.path.join(args.output_directory, args.env_name,
                                   args.pg_algorithm, pg_estimator_name,
                                   write_filename)
    os.makedirs(final_directory, exist_ok=True)
summa_writer = SummaryWriter(logdir=final_directory,
comment=pg_estimator_name + "-PG")
#--------------------------------------------------------------------------------------------------------------------------------------------------------
return args, summa_writer
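

# Minimal usage sketch (hypothetical driver; the repo's training script is the
# real entry point, and helper_config.json is assumed present alongside this
# file): get_args() parses the CLI, applies the helper-config overrides, and
# returns the args namespace plus a TensorBoard writer bound to the session's
# log directory.
if __name__ == "__main__":
    args, writer = get_args()
    print(f"env={args.env_name}, algo={args.pg_algorithm}, "
          f"estimator={args.pg_estimator}, seed={args.seed}")
    writer.add_scalar("debug/example_scalar", 0.0, 0)  # illustrative log entry
    writer.close()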