From daf0d7cbbfa814f0d5b422d3123fbf5a61530b63 Mon Sep 17 00:00:00 2001 From: hyeok9855 Date: Fri, 8 Sep 2023 15:32:56 +0900 Subject: [PATCH] hw2 solved --- hw2/cs285/agents/pg_agent.py | 24 +++++++--- hw2/cs285/infrastructure/rl_trainer.py | 59 ++++++++++++++++++++---- hw2/cs285/infrastructure/utils.py | 64 ++++++++++++++++++++++++-- hw2/cs285/policies/MLP_policy.py | 36 ++++++++++++--- hw2/cs285/scripts/run_hw2.py | 19 +++++++- 5 files changed, 177 insertions(+), 25 deletions(-) diff --git a/hw2/cs285/agents/pg_agent.py b/hw2/cs285/agents/pg_agent.py index 5a135c20..dd5e66f4 100644 --- a/hw2/cs285/agents/pg_agent.py +++ b/hw2/cs285/agents/pg_agent.py @@ -45,7 +45,9 @@ def train(self, observations, actions, rewards_list, next_observations, terminal # HINT1: use helper functions to compute qvals and advantages # HINT2: look at the MLPPolicyPG class for how to update the policy # and obtain a train_log - + qvals = self.calculate_q_vals(rewards_list) + advantages = self.estimate_advantage(observations, rewards_list, qvals, terminals) + train_log = self.actor.update(observations, actions, advantages, q_values=qvals if self.nn_baseline else None) return train_log def calculate_q_vals(self, rewards_list): @@ -70,12 +72,12 @@ def calculate_q_vals(self, rewards_list): # ordering as observations, actions, etc. if not self.reward_to_go: - TODO + q_values = np.concatenate([self._discounted_return(sub_list) for sub_list in rewards_list], axis=0) # Case 2: reward-to-go PG # Estimate Q^{pi}(s_t, a_t) by the discounted sum of rewards starting from t else: - TODO + q_values = np.concatenate([self._discounted_cumsum(sub_list) for sub_list in rewards_list], axis=0) return q_values @@ -95,7 +97,7 @@ def estimate_advantage(self, obs, rews_list, q_values, terminals): ## TODO: values were trained with standardized q_values, so ensure ## that the predictions have the same mean and standard deviation as ## the current batch of q_values - values = TODO + values = values_unnormalized * q_values.std() + q_values.mean() if self.gae_lambda is not None: ## append a dummy T+1 value for simpler recursive calculation @@ -117,13 +119,15 @@ def estimate_advantage(self, obs, rews_list, q_values, terminals): ## 0 otherwise. 
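# Standalone sketch of the GAE recursion these hints describe (illustrative,
# not part of the patch): `gamma` and `lam` stand in for self.gamma and
# self.gae_lambda, and `values` is assumed to already carry the dummy
# V(s_{T+1}) = 0 entry appended above.
import numpy as np

def gae_advantages_sketch(rews, values, terminals, gamma, lam):
    T = len(rews)
    advantages = np.zeros(T + 1)
    for t in reversed(range(T)):
        nonterminal = 1.0 - terminals[t]
        # TD residual: delta_t = r_t + gamma * V(s_{t+1}) * (1 - done_t) - V(s_t)
        delta = rews[t] + gamma * values[t + 1] * nonterminal - values[t]
        # GAE: A_t = delta_t + gamma * lambda * (1 - done_t) * A_{t+1}
        advantages[t] = delta + gamma * lam * nonterminal * advantages[t + 1]
    return advantages[:-1]  # drop the dummy trailing entry, as the code below does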
## HINT 2: self.gae_lambda is the lambda value in the ## GAE formula + delta = rews[i] + self.gamma * (values[i + 1] * (1 - terminals[i])) - values[i] + advantages[i] = delta + self.gamma * self.gae_lambda * advantages[i + 1] * (1 - terminals[i]) # remove dummy advantage advantages = advantages[:-1] else: ## TODO: compute advantage estimates using q_values, and values as baselines - advantages = TODO + advantages = q_values - values # Else, just set the advantage to [Q] else: @@ -133,7 +137,7 @@ def estimate_advantage(self, obs, rews_list, q_values, terminals): if self.standardize_advantages: ## TODO: standardize the advantages to have a mean of zero ## and a standard deviation of one - advantages = TODO + advantages = (advantages - advantages.mean()) / advantages.std() return advantages @@ -160,6 +164,7 @@ def _discounted_return(self, rewards): """ # TODO: create list_of_discounted_returns + list_of_discounted_returns = [sum((self.gamma ** _t) * _r for _t, _r in enumerate(rewards))] * len(rewards) return list_of_discounted_returns @@ -173,5 +178,10 @@ def _discounted_cumsum(self, rewards): # TODO: create `list_of_discounted_returns` # HINT: it is possible to write a vectorized solution, but a solution # using a for loop is also fine - + list_of_discounted_cumsums = [] + sum_last = 0 + for i in range(len(rewards)): + sum_last = sum_last * self.gamma + rewards[-(i + 1)] + list_of_discounted_cumsums.append(sum_last) + list_of_discounted_cumsums.reverse() return list_of_discounted_cumsums diff --git a/hw2/cs285/infrastructure/rl_trainer.py b/hw2/cs285/infrastructure/rl_trainer.py index 66c408e6..f3414933 100644 --- a/hw2/cs285/infrastructure/rl_trainer.py +++ b/hw2/cs285/infrastructure/rl_trainer.py @@ -5,6 +5,7 @@ import time import gym +from gym import spaces from gym import wrappers import numpy as np import torch @@ -63,7 +64,7 @@ def __init__(self, params): MAX_VIDEO_LEN = self.params['ep_len'] # Is this env continuous, or self.discrete? - discrete = isinstance(self.env.action_space, gym.spaces.Discrete) + discrete = isinstance(self.env.action_space, spaces.Discrete) # Are the observations images? 
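# Illustrative check of the space-inspection logic above (not something the
# patch adds): CartPole-v1 is used here purely as a convenient example env.
import gym
from gym import spaces

_env = gym.make('CartPole-v1')
_discrete = isinstance(_env.action_space, spaces.Discrete)                    # True for CartPole
_is_img = len(_env.observation_space.shape) > 2                               # False: 1-D vector obs
_ob_dim = _env.observation_space.shape[0]                                     # 4
_ac_dim = _env.action_space.n if _discrete else _env.action_space.shape[0]    # 2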
img = len(self.env.observation_space.shape) > 2 @@ -116,9 +117,9 @@ def run_training_loop(self, n_iter, collect_policy, eval_policy, # decide if videos should be rendered/logged at this iteration if itr % self.params['video_log_freq'] == 0 and self.params['video_log_freq'] != -1: - self.logvideo = True + self.log_video = True else: - self.logvideo = False + self.log_video = False # decide if metrics should be logged if self.params['scalar_log_freq'] == -1: @@ -142,7 +143,7 @@ def run_training_loop(self, n_iter, collect_policy, eval_policy, train_logs = self.train_agent() # log/save - if self.logvideo or self.logmetrics: + if self.log_video or self.logmetrics: # perform logging print('\nBeginning logging procedure...') self.perform_logging(itr, paths, eval_policy, train_video_paths, train_logs) @@ -153,11 +154,53 @@ def run_training_loop(self, n_iter, collect_policy, eval_policy, #################################### #################################### - def collect_training_trajectories(self, itr, initial_expertdata, collect_policy, batch_size): - # TODO: GETTHIS from HW1 + def collect_training_trajectories( + self, + itr, + load_initial_expertdata, + collect_policy, + batch_size, + ): + """ + :param itr: + :param load_initial_expertdata: path to expert data pkl file + :param collect_policy: the current policy using which we collect data + :param batch_size: the number of transitions we collect + :return: + paths: a list trajectories + envsteps_this_batch: the sum over the numbers of environment steps in paths + train_video_paths: paths which also contain videos for visualization purposes + """ + # collect `batch_size` samples to be used for training + print("\nCollecting data to be used for training...") + paths, envsteps_this_batch = None, 0 + train_video_paths = None + + paths, envsteps_this_batch = utils.sample_trajectories(self.env, collect_policy, batch_size, self.params['ep_len']) + + # collect more rollouts with the same policy, to be saved as videos in tensorboard + # note: here, we collect MAX_NVIDEO rollouts, each of length MAX_VIDEO_LEN + if self.log_video: + print('\nCollecting train rollouts to be used for saving videos...') + train_video_paths = utils.sample_n_trajectories(self.env, collect_policy, MAX_NVIDEO, MAX_VIDEO_LEN, True) + + return paths, envsteps_this_batch, train_video_paths def train_agent(self): - # TODO: GETTHIS from HW1 + print('\nTraining agent using sampled data from replay buffer...') + all_logs = [] + for train_step in range(self.params['num_agent_train_steps_per_iter']): + # sample some data from the data buffer + ( + ob_batch, ac_batch, re_batch, next_ob_batch, terminal_batch + ) = self.agent.sample(self.params["train_batch_size"]) + + # use the sampled data to train an agent + for i in range(self.params['num_gradient_steps_per_traj']): + train_log = self.agent.train(ob_batch, ac_batch, re_batch, next_ob_batch, terminal_batch) + # keep the agent's training log for debugging + all_logs.append(train_log) # type: ignore + return all_logs #################################### #################################### @@ -173,7 +216,7 @@ def perform_logging(self, itr, paths, eval_policy, train_video_paths, all_logs): eval_paths, eval_envsteps_this_batch = utils.sample_trajectories(self.env, eval_policy, self.params['eval_batch_size'], self.params['ep_len']) # save eval rollouts as videos in tensorboard event file - if self.logvideo and train_video_paths != None: + if self.log_video and train_video_paths != None: print('\nCollecting video rollouts eval') 
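# Sketch of the rollout bookkeeping this trainer relies on (illustrative,
# assuming the Path(...) helper in infrastructure/utils.py stores per-step
# arrays under keys such as "reward"): envsteps_this_batch is simply the
# sum of the individual path lengths.
def path_length_sketch(path):
    return len(path["reward"])

def envsteps_sketch(paths):
    return sum(path_length_sketch(p) for p in paths)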
eval_video_paths = utils.sample_n_trajectories(self.env, eval_policy, MAX_NVIDEO, MAX_VIDEO_LEN, True) diff --git a/hw2/cs285/infrastructure/utils.py b/hw2/cs285/infrastructure/utils.py index d23b4d3e..96e9f6e7 100644 --- a/hw2/cs285/infrastructure/utils.py +++ b/hw2/cs285/infrastructure/utils.py @@ -55,13 +55,71 @@ def mean_squared_error(a, b): ############################################ def sample_trajectory(env, policy, max_path_length, render=False, render_mode=('rgb_array')): - # TODO: get this from hw1 + + # initialize env for the beginning of a new rollout + ob = env.reset() # HINT: should be the output of resetting the env + + # init vars + obs, acs, rewards, next_obs, terminals, image_obs = [], [], [], [], [], [] + steps = 0 + while True: + + # render image of the simulated env + if render: + if 'rgb_array' in render_mode: + # if hasattr(env, 'sim'): + # image_obs.append(env.sim.render(camera_name='track', height=500, width=500)[::-1]) + # else: + image_obs.append(env.render(mode=render_mode)) + if 'human' in render_mode: + env.render(mode=render_mode) + time.sleep(env.model.opt.timestep) + + # use the most recent ob to decide what to do + obs.append(ob) + ac = policy.get_action(ob) # HINT: query the policy's get_action function + ac = ac[0] + acs.append(ac) + + # take that action and record results + ob, rew, done, _ = env.step(ac) + + # record result of taking that action + steps += 1 + next_obs.append(ob) + rewards.append(rew) + + # rollout can end due to done, or due to max_path_length + rollout_done = 1 if done or steps >= max_path_length else 0 + terminals.append(rollout_done) + + if rollout_done: + break + + return Path(obs, image_obs, acs, rewards, next_obs, terminals) def sample_trajectories(env, policy, min_timesteps_per_batch, max_path_length, render=False, render_mode=('rgb_array')): - # TODO: get this from hw1 + """ + Collect rollouts until we have collected min_timesteps_per_batch steps. + """ + paths = [] + timesteps_this_batch = 0 + while timesteps_this_batch < min_timesteps_per_batch: + path = sample_trajectory(env, policy, max_path_length, render, render_mode) + paths.append(path) + timesteps_this_batch += get_pathlength(path) + + return paths, timesteps_this_batch def sample_n_trajectories(env, policy, ntraj, max_path_length, render=False, render_mode=('rgb_array')): - # TODO: get this from hw1 + """ + Collect ntraj rollouts. 
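# Usage sketch for the sampling helpers above, with a hypothetical random
# policy; it only shows the expected call pattern and return shapes and is
# not a definitive test of the homework code.
import gym
import numpy as np

class _RandomPolicySketch:
    """Mimics the policy interface sample_trajectory expects: get_action returns a batch of one action."""
    def __init__(self, action_space):
        self.action_space = action_space

    def get_action(self, ob):
        return np.array([self.action_space.sample()])

# _env = gym.make('CartPole-v1')
# _paths, _steps = sample_trajectories(_env, _RandomPolicySketch(_env.action_space),
#                                      min_timesteps_per_batch=200, max_path_length=100)
# _steps >= 200, and no path exceeds max_path_length steps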
+ """ + paths = [] + for _ in range(ntraj): + paths.append(sample_trajectory(env, policy, max_path_length, render, render_mode)) + + return paths ############################################ ############################################ diff --git a/hw2/cs285/policies/MLP_policy.py b/hw2/cs285/policies/MLP_policy.py index c6f3b463..7ae46ccc 100644 --- a/hw2/cs285/policies/MLP_policy.py +++ b/hw2/cs285/policies/MLP_policy.py @@ -1,5 +1,6 @@ import abc import itertools +from typing import cast from torch import nn from torch.nn import functional as F from torch import optim @@ -52,7 +53,7 @@ def __init__(self, self.mean_net = ptu.build_mlp(input_size=self.ob_dim, output_size=self.ac_dim, n_layers=self.n_layers, size=self.size) - self.logstd = nn.Parameter( + self.logstd = nn.parameter.Parameter( torch.zeros(self.ac_dim, dtype=torch.float32, device=ptu.device) ) self.mean_net.to(ptu.device) @@ -86,7 +87,15 @@ def save(self, filepath): # query the policy with observation(s) to get selected action(s) def get_action(self, obs: np.ndarray) -> np.ndarray: - # TODO: get this from HW1 + if len(obs.shape) > 1: + observation = obs + else: + observation = obs[None] + + observation = ptu.from_numpy(observation.astype(np.float32)) + + distn = self(observation) + return ptu.to_numpy(distn.sample()) # update/train this policy def update(self, observations, actions, **kwargs): @@ -97,12 +106,14 @@ def update(self, observations, actions, **kwargs): # through it. For example, you can return a torch.FloatTensor. You can also # return more flexible objects, such as a # `torch.distributions.Distribution` object. It's up to you! - def forward(self, observation: torch.FloatTensor): + def forward(self, observation: torch.FloatTensor) -> torch.distributions.Distribution: if self.discrete: + assert self.logits_na is not None logits = self.logits_na(observation) action_distribution = distributions.Categorical(logits=logits) return action_distribution else: + assert self.mean_net is not None and self.logstd is not None batch_mean = self.mean_net(observation) scale_tril = torch.diag(torch.exp(self.logstd)) batch_dim = batch_mean.shape[0] @@ -123,7 +134,7 @@ def __init__(self, ac_dim, ob_dim, n_layers, size, **kwargs): self.baseline_loss = nn.MSELoss() def update(self, observations, actions, advantages, q_values=None): - observations = ptu.from_numpy(observations) + observations = cast(torch.FloatTensor, ptu.from_numpy(observations)) actions = ptu.from_numpy(actions) advantages = ptu.from_numpy(advantages) @@ -137,7 +148,12 @@ def update(self, observations, actions, advantages, q_values=None): # HINT4: use self.optimizer to optimize the loss. Remember to # 'zero_grad' first - TODO + log_probs = self.forward(observations).log_prob(actions) + loss = -(log_probs * advantages).mean() + + self.optimizer.zero_grad() + loss.backward() + self.optimizer.step() if self.nn_baseline: ## TODO: update the neural network baseline using the q_values as @@ -148,8 +164,15 @@ def update(self, observations, actions, advantages, q_values=None): ## updating the baseline. 
Remember to 'zero_grad' first ## HINT2: You will need to convert the targets into a tensor using ## ptu.from_numpy before using it in the loss + assert isinstance(q_values, np.ndarray) and isinstance(self.baseline, nn.Module) + q_values = (q_values - q_values.mean()) / q_values.std() + q_values = ptu.from_numpy(q_values) + baseline_v = self.baseline(observations).squeeze() + baseline_loss = self.baseline_loss(baseline_v, q_values) - TODO + self.baseline_optimizer.zero_grad() + baseline_loss.backward() + self.baseline_optimizer.step() train_log = { 'Training Loss': ptu.to_numpy(loss), @@ -166,6 +189,7 @@ def run_baseline_prediction(self, observations): Output: np.ndarray of size [N] """ + assert self.baseline is not None observations = ptu.from_numpy(observations) pred = self.baseline(observations) return ptu.to_numpy(pred.squeeze()) diff --git a/hw2/cs285/scripts/run_hw2.py b/hw2/cs285/scripts/run_hw2.py index 156b4b65..c6e9df5e 100644 --- a/hw2/cs285/scripts/run_hw2.py +++ b/hw2/cs285/scripts/run_hw2.py @@ -1,6 +1,11 @@ +import sys import os + +sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))) + import time +from pyvirtualdisplay.display import Display from cs285.infrastructure.rl_trainer import RL_Trainer from cs285.agents.pg_agent import PGAgent @@ -69,6 +74,7 @@ def main(): parser.add_argument('--train_batch_size', '-tb', type=int, default=1000) ##steps used per gradient step parser.add_argument('--num_agent_train_steps_per_iter', type=int, default=1) + parser.add_argument('--num_gradient_steps_per_traj', type=int, default=1) parser.add_argument('--discount', type=float, default=1.0) parser.add_argument('--learning_rate', '-lr', type=float, default=5e-3) parser.add_argument('--n_layers', '-l', type=int, default=2) @@ -94,7 +100,7 @@ def main(): # note that, to avoid confusion, you don't even have a train_batch_size argument anymore (above) params['train_batch_size'] = params['batch_size'] -################################## + ################################## ### CREATE DIRECTORY FOR LOGGING ################################## @@ -111,6 +117,14 @@ def main(): if not(os.path.exists(logdir)): os.makedirs(logdir) + #################################### + ### Virtual display for rendering + #################################### + display = None + if params['video_log_freq'] > 0: + display = Display(visible=False, size=(400, 300), backend="xvfb") + display.start() + ################### ### RUN TRAINING ################### @@ -118,6 +132,9 @@ def main(): trainer = PG_Trainer(params) trainer.run_training_loop() + if display is not None: + display.stop() + if __name__ == "__main__": main()
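# Closing sketch of the baseline target convention used in this patch
# (illustrative numpy only): the baseline is regressed onto standardized
# Q-values, so estimate_advantage() above rescales its raw predictions by
# the current batch's mean and std before forming advantages.
import numpy as np

_q_values = np.array([1.0, 3.0, 5.0, 7.0])
_targets = (_q_values - _q_values.mean()) / _q_values.std()        # what the baseline MSE loss fits

_baseline_raw = _targets + 0.1                                      # stand-in for imperfect predictions
_values = _baseline_raw * _q_values.std() + _q_values.mean()        # un-standardize before use
_advantages = _q_values - _values                                    # baseline-subtracted advantages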