hw2 solved
hyeok9855 committed Sep 8, 2023
1 parent eccd142 commit daf0d7c
Showing 5 changed files with 177 additions and 25 deletions.
24 changes: 17 additions & 7 deletions hw2/cs285/agents/pg_agent.py
@@ -45,7 +45,9 @@ def train(self, observations, actions, rewards_list, next_observations, terminal
# HINT1: use helper functions to compute qvals and advantages
# HINT2: look at the MLPPolicyPG class for how to update the policy
# and obtain a train_log

qvals = self.calculate_q_vals(rewards_list)
advantages = self.estimate_advantage(observations, rewards_list, qvals, terminals)
train_log = self.actor.update(observations, actions, advantages, q_values=qvals if self.nn_baseline else None)
return train_log

def calculate_q_vals(self, rewards_list):
@@ -70,12 +72,12 @@ def calculate_q_vals(self, rewards_list):
# ordering as observations, actions, etc.

if not self.reward_to_go:
TODO
q_values = np.concatenate([self._discounted_return(sub_list) for sub_list in rewards_list], axis=0)

# Case 2: reward-to-go PG
# Estimate Q^{pi}(s_t, a_t) by the discounted sum of rewards starting from t
else:
TODO
q_values = np.concatenate([self._discounted_cumsum(sub_list) for sub_list in rewards_list], axis=0)

return q_values
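
As a quick illustration of the two estimators (a standalone sketch, not part of the commit): the full discounted return assigns every step of a trajectory the same scalar, while reward-to-go sums only from the current step onward.

import numpy as np

gamma = 0.9
rewards = np.array([1.0, 2.0, 3.0])

# Case 1: every timestep of the trajectory gets the same full discounted return
full_return = sum(gamma ** t * r for t, r in enumerate(rewards))
q_full = np.full(len(rewards), full_return)                 # [5.23, 5.23, 5.23]

# Case 2: reward-to-go, i.e. the discounted sum starting at each timestep
q_rtg = np.array([sum(gamma ** (t2 - t) * rewards[t2]
                      for t2 in range(t, len(rewards)))
                  for t in range(len(rewards))])            # [5.23, 4.70, 3.00]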

@@ -95,7 +97,7 @@ def estimate_advantage(self, obs, rews_list, q_values, terminals):
## TODO: values were trained with standardized q_values, so ensure
## that the predictions have the same mean and standard deviation as
## the current batch of q_values
values = TODO
values = values_unnormalized * q_values.std() + q_values.mean()

if self.gae_lambda is not None:
## append a dummy T+1 value for simpler recursive calculation
@@ -117,13 +119,15 @@ def estimate_advantage(self, obs, rews_list, q_values, terminals):
## 0 otherwise.
## HINT 2: self.gae_lambda is the lambda value in the
## GAE formula
delta = rews[i] + self.gamma * (values[i + 1] * (1 - terminals[i])) - values[i]
advantages[i] = delta + self.gamma * self.gae_lambda * advantages[i + 1] * (1 - terminals[i])

# remove dummy advantage
advantages = advantages[:-1]

else:
## TODO: compute advantage estimates using q_values, and values as baselines
advantages = TODO
advantages = q_values - values

# Else, just set the advantage to [Q]
else:
@@ -133,7 +137,7 @@ def estimate_advantage(self, obs, rews_list, q_values, terminals):
if self.standardize_advantages:
## TODO: standardize the advantages to have a mean of zero
## and a standard deviation of one
advantages = TODO
advantages = (advantages - advantages.mean()) / advantages.std()

return advantages
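
A minimal standalone sketch of the GAE recursion and the final standardization implemented above (illustrative only; it assumes 1-D NumPy arrays of length T and adds a small epsilon to the standard deviation, which the commit does not):

import numpy as np

def gae_advantages(rews, values, terminals, gamma, lam):
    T = len(rews)
    values = np.append(values, 0.0)           # dummy V(s_{T+1}) = 0, as appended above
    advantages = np.zeros(T + 1)
    for i in reversed(range(T)):
        nonterminal = 1.0 - terminals[i]
        delta = rews[i] + gamma * values[i + 1] * nonterminal - values[i]
        advantages[i] = delta + gamma * lam * advantages[i + 1] * nonterminal
    advantages = advantages[:-1]              # drop the dummy entry
    return (advantages - advantages.mean()) / (advantages.std() + 1e-8)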

@@ -160,6 +164,7 @@ def _discounted_return(self, rewards):
"""

# TODO: create list_of_discounted_returns
list_of_discounted_returns = [sum((self.gamma ** _t) * _r for _t, _r in enumerate(rewards))] * len(rewards)

return list_of_discounted_returns

@@ -173,5 +178,10 @@ def _discounted_cumsum(self, rewards):
# TODO: create `list_of_discounted_returns`
# HINT: it is possible to write a vectorized solution, but a solution
# using a for loop is also fine

list_of_discounted_cumsums = []
sum_last = 0
for i in range(len(rewards)):
sum_last = sum_last * self.gamma + rewards[-(i + 1)]
list_of_discounted_cumsums.append(sum_last)
list_of_discounted_cumsums.reverse()
return list_of_discounted_cumsums
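
The hint above mentions a vectorized alternative; one common way to vectorize the reward-to-go computation (an illustration, not part of the commit) is to run a first-order IIR filter over the reversed rewards and reverse the result:

import numpy as np
from scipy.signal import lfilter

def discounted_cumsum_vectorized(rewards, gamma):
    # y[t] = r[t] + gamma * y[t+1], obtained by filtering the reversed sequence
    rewards = np.asarray(rewards, dtype=np.float64)
    return lfilter([1.0], [1.0, -gamma], rewards[::-1])[::-1]

# matches the loop implementation above on a small example
assert np.allclose(discounted_cumsum_vectorized([1.0, 2.0, 3.0], 0.9),
                   [1.0 + 0.9 * (2.0 + 0.9 * 3.0), 2.0 + 0.9 * 3.0, 3.0])
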
59 changes: 51 additions & 8 deletions hw2/cs285/infrastructure/rl_trainer.py
@@ -5,6 +5,7 @@
import time

import gym
from gym import spaces
from gym import wrappers
import numpy as np
import torch
@@ -63,7 +64,7 @@ def __init__(self, params):
MAX_VIDEO_LEN = self.params['ep_len']

# Is this env continuous, or self.discrete?
discrete = isinstance(self.env.action_space, gym.spaces.Discrete)
discrete = isinstance(self.env.action_space, spaces.Discrete)
# Are the observations images?
img = len(self.env.observation_space.shape) > 2

@@ -116,9 +117,9 @@ def run_training_loop(self, n_iter, collect_policy, eval_policy,

# decide if videos should be rendered/logged at this iteration
if itr % self.params['video_log_freq'] == 0 and self.params['video_log_freq'] != -1:
self.logvideo = True
self.log_video = True
else:
self.logvideo = False
self.log_video = False

# decide if metrics should be logged
if self.params['scalar_log_freq'] == -1:
@@ -142,7 +143,7 @@ def run_training_loop(self, n_iter, collect_policy, eval_policy,
train_logs = self.train_agent()

# log/save
if self.logvideo or self.logmetrics:
if self.log_video or self.logmetrics:
# perform logging
print('\nBeginning logging procedure...')
self.perform_logging(itr, paths, eval_policy, train_video_paths, train_logs)
@@ -153,11 +154,53 @@ def run_training_loop(self, n_iter, collect_policy, eval_policy,
####################################
####################################

def collect_training_trajectories(self, itr, initial_expertdata, collect_policy, batch_size):
# TODO: GETTHIS from HW1
def collect_training_trajectories(
self,
itr,
load_initial_expertdata,
collect_policy,
batch_size,
):
"""
:param itr:
:param load_initial_expertdata: path to expert data pkl file
:param collect_policy: the current policy used to collect data
:param batch_size: the number of transitions to collect
:return:
paths: a list of trajectories
envsteps_this_batch: the total number of environment steps across paths
train_video_paths: paths that also contain videos for visualization purposes
"""
# collect `batch_size` samples to be used for training
print("\nCollecting data to be used for training...")
paths, envsteps_this_batch = None, 0
train_video_paths = None

paths, envsteps_this_batch = utils.sample_trajectories(self.env, collect_policy, batch_size, self.params['ep_len'])

# collect more rollouts with the same policy, to be saved as videos in tensorboard
# note: here, we collect MAX_NVIDEO rollouts, each of length MAX_VIDEO_LEN
if self.log_video:
print('\nCollecting train rollouts to be used for saving videos...')
train_video_paths = utils.sample_n_trajectories(self.env, collect_policy, MAX_NVIDEO, MAX_VIDEO_LEN, True)

return paths, envsteps_this_batch, train_video_paths

def train_agent(self):
# TODO: GETTHIS from HW1
print('\nTraining agent using sampled data from replay buffer...')
all_logs = []
for train_step in range(self.params['num_agent_train_steps_per_iter']):
# sample some data from the data buffer
(
ob_batch, ac_batch, re_batch, next_ob_batch, terminal_batch
) = self.agent.sample(self.params["train_batch_size"])

# use the sampled data to train an agent
for i in range(self.params['num_gradient_steps_per_traj']):
train_log = self.agent.train(ob_batch, ac_batch, re_batch, next_ob_batch, terminal_batch)
# keep the agent's training log for debugging
all_logs.append(train_log) # type: ignore
return all_logs
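
For reference, an illustrative params dictionary covering only the keys read by collect_training_trajectories, train_agent, and perform_logging in this file (the values are placeholders, not taken from the commit or the run scripts):

params = {
    'ep_len': 1000,                          # max rollout length
    'train_batch_size': 1000,                # transitions sampled from the buffer
    'num_agent_train_steps_per_iter': 1,     # buffer samples per iteration
    'num_gradient_steps_per_traj': 1,        # agent.train calls per sample
    'video_log_freq': -1,                    # -1 disables video logging
    'scalar_log_freq': 1,                    # log scalars every iteration
    'eval_batch_size': 400,                  # env steps collected for evaluation
}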

####################################
####################################
@@ -173,7 +216,7 @@ def perform_logging(self, itr, paths, eval_policy, train_video_paths, all_logs):
eval_paths, eval_envsteps_this_batch = utils.sample_trajectories(self.env, eval_policy, self.params['eval_batch_size'], self.params['ep_len'])

# save eval rollouts as videos in tensorboard event file
if self.logvideo and train_video_paths != None:
if self.log_video and train_video_paths != None:
print('\nCollecting video rollouts eval')
eval_video_paths = utils.sample_n_trajectories(self.env, eval_policy, MAX_NVIDEO, MAX_VIDEO_LEN, True)

64 changes: 61 additions & 3 deletions hw2/cs285/infrastructure/utils.py
@@ -55,13 +55,71 @@ def mean_squared_error(a, b):
############################################

def sample_trajectory(env, policy, max_path_length, render=False, render_mode=('rgb_array')):
# TODO: get this from hw1

# initialize env for the beginning of a new rollout
ob = env.reset() # HINT: should be the output of resetting the env

# init vars
obs, acs, rewards, next_obs, terminals, image_obs = [], [], [], [], [], []
steps = 0
while True:

# render image of the simulated env
if render:
if 'rgb_array' in render_mode:
# if hasattr(env, 'sim'):
# image_obs.append(env.sim.render(camera_name='track', height=500, width=500)[::-1])
# else:
image_obs.append(env.render(mode=render_mode))
if 'human' in render_mode:
env.render(mode=render_mode)
time.sleep(env.model.opt.timestep)

# use the most recent ob to decide what to do
obs.append(ob)
ac = policy.get_action(ob) # HINT: query the policy's get_action function
ac = ac[0]
acs.append(ac)

# take that action and record results
ob, rew, done, _ = env.step(ac)

# record result of taking that action
steps += 1
next_obs.append(ob)
rewards.append(rew)

# rollout can end due to done, or due to max_path_length
rollout_done = 1 if done or steps >= max_path_length else 0
terminals.append(rollout_done)

if rollout_done:
break

return Path(obs, image_obs, acs, rewards, next_obs, terminals)

def sample_trajectories(env, policy, min_timesteps_per_batch, max_path_length, render=False, render_mode=('rgb_array')):
# TODO: get this from hw1
"""
Collect rollouts until we have collected min_timesteps_per_batch steps.
"""
paths = []
timesteps_this_batch = 0
while timesteps_this_batch < min_timesteps_per_batch:
path = sample_trajectory(env, policy, max_path_length, render, render_mode)
paths.append(path)
timesteps_this_batch += get_pathlength(path)

return paths, timesteps_this_batch

def sample_n_trajectories(env, policy, ntraj, max_path_length, render=False, render_mode=('rgb_array')):
# TODO: get this from hw1
"""
Collect ntraj rollouts.
"""
paths = []
for _ in range(ntraj):
paths.append(sample_trajectory(env, policy, max_path_length, render, render_mode))

return paths
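
A minimal usage sketch of the three sampling helpers above (illustrative; env is assumed to be any Gym environment and policy any object with a get_action method):

# a single rollout, capped at 200 steps
path = sample_trajectory(env, policy, max_path_length=200)

# keep rolling out until at least 5000 environment steps have been collected
paths, n_steps = sample_trajectories(env, policy,
                                     min_timesteps_per_batch=5000,
                                     max_path_length=200)

# exactly 3 rollouts, rendered so they can be logged as videos
video_paths = sample_n_trajectories(env, policy, ntraj=3,
                                    max_path_length=200, render=True)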

############################################
############################################
36 changes: 30 additions & 6 deletions hw2/cs285/policies/MLP_policy.py
@@ -1,5 +1,6 @@
import abc
import itertools
from typing import cast
from torch import nn
from torch.nn import functional as F
from torch import optim
@@ -52,7 +53,7 @@ def __init__(self,
self.mean_net = ptu.build_mlp(input_size=self.ob_dim,
output_size=self.ac_dim,
n_layers=self.n_layers, size=self.size)
self.logstd = nn.Parameter(
self.logstd = nn.parameter.Parameter(
torch.zeros(self.ac_dim, dtype=torch.float32, device=ptu.device)
)
self.mean_net.to(ptu.device)
@@ -86,7 +87,15 @@ def save(self, filepath):

# query the policy with observation(s) to get selected action(s)
def get_action(self, obs: np.ndarray) -> np.ndarray:
# TODO: get this from HW1
if len(obs.shape) > 1:
observation = obs
else:
observation = obs[None]

observation = ptu.from_numpy(observation.astype(np.float32))

distn = self(observation)
return ptu.to_numpy(distn.sample())

# update/train this policy
def update(self, observations, actions, **kwargs):
@@ -97,12 +106,14 @@ def update(self, observations, actions, **kwargs):
# through it. For example, you can return a torch.FloatTensor. You can also
# return more flexible objects, such as a
# `torch.distributions.Distribution` object. It's up to you!
def forward(self, observation: torch.FloatTensor):
def forward(self, observation: torch.FloatTensor) -> torch.distributions.Distribution:
if self.discrete:
assert self.logits_na is not None
logits = self.logits_na(observation)
action_distribution = distributions.Categorical(logits=logits)
return action_distribution
else:
assert self.mean_net is not None and self.logstd is not None
batch_mean = self.mean_net(observation)
scale_tril = torch.diag(torch.exp(self.logstd))
batch_dim = batch_mean.shape[0]
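
The distribution returned here is consumed in two places: get_action samples from it, and MLPPolicyPG.update calls log_prob on it. A standalone sketch of the continuous case (illustrative only), assuming a diagonal Gaussian built from a batch of means and a learned logstd as in this class:

import torch
from torch import distributions

mean = torch.zeros(4, 2)                       # batch of 4 observations, ac_dim = 2
logstd = torch.zeros(2)
dist = distributions.MultivariateNormal(mean, scale_tril=torch.diag(torch.exp(logstd)))

actions = dist.sample()                        # shape [4, 2], used by get_action
log_probs = dist.log_prob(actions)             # shape [4], used in the PG loss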
@@ -123,7 +134,7 @@ def __init__(self, ac_dim, ob_dim, n_layers, size, **kwargs):
self.baseline_loss = nn.MSELoss()

def update(self, observations, actions, advantages, q_values=None):
observations = ptu.from_numpy(observations)
observations = cast(torch.FloatTensor, ptu.from_numpy(observations))
actions = ptu.from_numpy(actions)
advantages = ptu.from_numpy(advantages)

@@ -137,7 +148,12 @@ def update(self, observations, actions, advantages, q_values=None):
# HINT4: use self.optimizer to optimize the loss. Remember to
# 'zero_grad' first

TODO
log_probs = self.forward(observations).log_prob(actions)
loss = -(log_probs * advantages).mean()

self.optimizer.zero_grad()
loss.backward()
self.optimizer.step()

if self.nn_baseline:
## TODO: update the neural network baseline using the q_values as
@@ -148,8 +164,15 @@ def update(self, observations, actions, advantages, q_values=None):
## updating the baseline. Remember to 'zero_grad' first
## HINT2: You will need to convert the targets into a tensor using
## ptu.from_numpy before using it in the loss
assert isinstance(q_values, np.ndarray) and isinstance(self.baseline, nn.Module)
q_values = (q_values - q_values.mean()) / q_values.std()
q_values = ptu.from_numpy(q_values)
baseline_v = self.baseline(observations).squeeze()
baseline_loss = self.baseline_loss(baseline_v, q_values)

TODO
self.baseline_optimizer.zero_grad()
baseline_loss.backward()
self.baseline_optimizer.step()

train_log = {
'Training Loss': ptu.to_numpy(loss),
@@ -166,6 +189,7 @@ def run_baseline_prediction(self, observations):
Output: np.ndarray of size [N]
"""
assert self.baseline is not None
observations = ptu.from_numpy(observations)
pred = self.baseline(observations)
return ptu.to_numpy(pred.squeeze())
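
A hypothetical end-to-end call of MLPPolicyPG.update with dummy NumPy batches (shapes only; the constructor keyword arguments learning_rate and nn_baseline are assumptions about the parent constructor, which this diff does not show):

import numpy as np

policy = MLPPolicyPG(ac_dim=2, ob_dim=4, n_layers=2, size=64,
                     learning_rate=1e-3, nn_baseline=True)   # kwargs assumed

obs        = np.random.randn(128, 4).astype(np.float32)
acs        = np.random.randn(128, 2).astype(np.float32)
advantages = np.random.randn(128).astype(np.float32)
q_values   = np.random.randn(128).astype(np.float32)

train_log = policy.update(obs, acs, advantages, q_values=q_values)
train_loss = train_log['Training Loss']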