From daf0d7cbbfa814f0d5b422d3123fbf5a61530b63 Mon Sep 17 00:00:00 2001 From: hyeok9855 Date: Fri, 8 Sep 2023 15:32:56 +0900 Subject: [PATCH] hw2 solved --- hw2/cs285/agents/pg_agent.py | 24 +++++++--- hw2/cs285/infrastructure/rl_trainer.py | 59 ++++++++++++++++++++---- hw2/cs285/infrastructure/utils.py | 64 ++++++++++++++++++++++++-- hw2/cs285/policies/MLP_policy.py | 36 ++++++++++++--- hw2/cs285/scripts/run_hw2.py | 19 +++++++- 5 files changed, 177 insertions(+), 25 deletions(-) diff --git a/hw2/cs285/agents/pg_agent.py b/hw2/cs285/agents/pg_agent.py index 5a135c20..dd5e66f4 100644 --- a/hw2/cs285/agents/pg_agent.py +++ b/hw2/cs285/agents/pg_agent.py @@ -45,7 +45,9 @@ def train(self, observations, actions, rewards_list, next_observations, terminal # HINT1: use helper functions to compute qvals and advantages # HINT2: look at the MLPPolicyPG class for how to update the policy # and obtain a train_log - + qvals = self.calculate_q_vals(rewards_list) + advantages = self.estimate_advantage(observations, rewards_list, qvals, terminals) + train_log = self.actor.update(observations, actions, advantages, q_values=qvals if self.nn_baseline else None) return train_log def calculate_q_vals(self, rewards_list): @@ -70,12 +72,12 @@ def calculate_q_vals(self, rewards_list): # ordering as observations, actions, etc. if not self.reward_to_go: - TODO + q_values = np.concatenate([self._discounted_return(sub_list) for sub_list in rewards_list], axis=0) # Case 2: reward-to-go PG # Estimate Q^{pi}(s_t, a_t) by the discounted sum of rewards starting from t else: - TODO + q_values = np.concatenate([self._discounted_cumsum(sub_list) for sub_list in rewards_list], axis=0) return q_values @@ -95,7 +97,7 @@ def estimate_advantage(self, obs, rews_list, q_values, terminals): ## TODO: values were trained with standardized q_values, so ensure ## that the predictions have the same mean and standard deviation as ## the current batch of q_values - values = TODO + values = values_unnormalized * q_values.std() + q_values.mean() if self.gae_lambda is not None: ## append a dummy T+1 value for simpler recursive calculation @@ -117,13 +119,15 @@ def estimate_advantage(self, obs, rews_list, q_values, terminals): ## 0 otherwise. 
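# Standalone sketch of the GAE recursion these hints describe (illustrative,
# not part of the patch): `gamma` and `lam` stand in for self.gamma and
# self.gae_lambda, and `values` is assumed to already carry the dummy
# V(s_{T+1}) = 0 entry appended above.
import numpy as np

def gae_advantages_sketch(rews, values, terminals, gamma, lam):
    T = len(rews)
    advantages = np.zeros(T + 1)
    for t in reversed(range(T)):
        nonterminal = 1.0 - terminals[t]
        # TD residual: delta_t = r_t + gamma * V(s_{t+1}) * (1 - done_t) - V(s_t)
        delta = rews[t] + gamma * values[t + 1] * nonterminal - values[t]
        # GAE: A_t = delta_t + gamma * lambda * (1 - done_t) * A_{t+1}
        advantages[t] = delta + gamma * lam * nonterminal * advantages[t + 1]
    return advantages[:-1]  # drop the dummy trailing entry, as the code below does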
## HINT 2: self.gae_lambda is the lambda value in the ## GAE formula + delta = rews[i] + self.gamma * (values[i + 1] * (1 - terminals[i])) - values[i] + advantages[i] = delta + self.gamma * self.gae_lambda * advantages[i + 1] * (1 - terminals[i]) # remove dummy advantage advantages = advantages[:-1] else: ## TODO: compute advantage estimates using q_values, and values as baselines - advantages = TODO + advantages = q_values - values # Else, just set the advantage to [Q] else: @@ -133,7 +137,7 @@ def estimate_advantage(self, obs, rews_list, q_values, terminals): if self.standardize_advantages: ## TODO: standardize the advantages to have a mean of zero ## and a standard deviation of one - advantages = TODO + advantages = (advantages - advantages.mean()) / advantages.std() return advantages @@ -160,6 +164,7 @@ def _discounted_return(self, rewards): """ # TODO: create list_of_discounted_returns + list_of_discounted_returns = [sum((self.gamma ** _t) * _r for _t, _r in enumerate(rewards))] * len(rewards) return list_of_discounted_returns @@ -173,5 +178,10 @@ def _discounted_cumsum(self, rewards): # TODO: create `list_of_discounted_returns` # HINT: it is possible to write a vectorized solution, but a solution # using a for loop is also fine - + list_of_discounted_cumsums = [] + sum_last = 0 + for i in range(len(rewards)): + sum_last = sum_last * self.gamma + rewards[-(i + 1)] + list_of_discounted_cumsums.append(sum_last) + list_of_discounted_cumsums.reverse() return list_of_discounted_cumsums diff --git a/hw2/cs285/infrastructure/rl_trainer.py b/hw2/cs285/infrastructure/rl_trainer.py index 66c408e6..f3414933 100644 --- a/hw2/cs285/infrastructure/rl_trainer.py +++ b/hw2/cs285/infrastructure/rl_trainer.py @@ -5,6 +5,7 @@ import time import gym +from gym import spaces from gym import wrappers import numpy as np import torch @@ -63,7 +64,7 @@ def __init__(self, params): MAX_VIDEO_LEN = self.params['ep_len'] # Is this env continuous, or self.discrete? - discrete = isinstance(self.env.action_space, gym.spaces.Discrete) + discrete = isinstance(self.env.action_space, spaces.Discrete) # Are the observations images? 
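# Illustrative check of the space-inspection logic above (not something the
# patch adds): CartPole-v1 is used here purely as a convenient example env.
import gym
from gym import spaces

_env = gym.make('CartPole-v1')
_discrete = isinstance(_env.action_space, spaces.Discrete)                    # True for CartPole
_is_img = len(_env.observation_space.shape) > 2                               # False: 1-D vector obs
_ob_dim = _env.observation_space.shape[0]                                     # 4
_ac_dim = _env.action_space.n if _discrete else _env.action_space.shape[0]    # 2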
img = len(self.env.observation_space.shape) > 2 @@ -116,9 +117,9 @@ def run_training_loop(self, n_iter, collect_policy, eval_policy, # decide if videos should be rendered/logged at this iteration if itr % self.params['video_log_freq'] == 0 and self.params['video_log_freq'] != -1: - self.logvideo = True + self.log_video = True else: - self.logvideo = False + self.log_video = False # decide if metrics should be logged if self.params['scalar_log_freq'] == -1: @@ -142,7 +143,7 @@ def run_training_loop(self, n_iter, collect_policy, eval_policy, train_logs = self.train_agent() # log/save - if self.logvideo or self.logmetrics: + if self.log_video or self.logmetrics: # perform logging print('\nBeginning logging procedure...') self.perform_logging(itr, paths, eval_policy, train_video_paths, train_logs) @@ -153,11 +154,53 @@ def run_training_loop(self, n_iter, collect_policy, eval_policy, #################################### #################################### - def collect_training_trajectories(self, itr, initial_expertdata, collect_policy, batch_size): - # TODO: GETTHIS from HW1 + def collect_training_trajectories( + self, + itr, + load_initial_expertdata, + collect_policy, + batch_size, + ): + """ + :param itr: + :param load_initial_expertdata: path to expert data pkl file + :param collect_policy: the current policy using which we collect data + :param batch_size: the number of transitions we collect + :return: + paths: a list trajectories + envsteps_this_batch: the sum over the numbers of environment steps in paths + train_video_paths: paths which also contain videos for visualization purposes + """ + # collect `batch_size` samples to be used for training + print("\nCollecting data to be used for training...") + paths, envsteps_this_batch = None, 0 + train_video_paths = None + + paths, envsteps_this_batch = utils.sample_trajectories(self.env, collect_policy, batch_size, self.params['ep_len']) + + # collect more rollouts with the same policy, to be saved as videos in tensorboard + # note: here, we collect MAX_NVIDEO rollouts, each of length MAX_VIDEO_LEN + if self.log_video: + print('\nCollecting train rollouts to be used for saving videos...') + train_video_paths = utils.sample_n_trajectories(self.env, collect_policy, MAX_NVIDEO, MAX_VIDEO_LEN, True) + + return paths, envsteps_this_batch, train_video_paths def train_agent(self): - # TODO: GETTHIS from HW1 + print('\nTraining agent using sampled data from replay buffer...') + all_logs = [] + for train_step in range(self.params['num_agent_train_steps_per_iter']): + # sample some data from the data buffer + ( + ob_batch, ac_batch, re_batch, next_ob_batch, terminal_batch + ) = self.agent.sample(self.params["train_batch_size"]) + + # use the sampled data to train an agent + for i in range(self.params['num_gradient_steps_per_traj']): + train_log = self.agent.train(ob_batch, ac_batch, re_batch, next_ob_batch, terminal_batch) + # keep the agent's training log for debugging + all_logs.append(train_log) # type: ignore + return all_logs #################################### #################################### @@ -173,7 +216,7 @@ def perform_logging(self, itr, paths, eval_policy, train_video_paths, all_logs): eval_paths, eval_envsteps_this_batch = utils.sample_trajectories(self.env, eval_policy, self.params['eval_batch_size'], self.params['ep_len']) # save eval rollouts as videos in tensorboard event file - if self.logvideo and train_video_paths != None: + if self.log_video and train_video_paths != None: print('\nCollecting video rollouts eval') 
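# Sketch of the rollout bookkeeping this trainer relies on (illustrative,
# assuming the Path(...) helper in infrastructure/utils.py stores per-step
# arrays under keys such as "reward"): envsteps_this_batch is simply the
# sum of the individual path lengths.
def path_length_sketch(path):
    return len(path["reward"])

def envsteps_sketch(paths):
    return sum(path_length_sketch(p) for p in paths)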
eval_video_paths = utils.sample_n_trajectories(self.env, eval_policy, MAX_NVIDEO, MAX_VIDEO_LEN, True) diff --git a/hw2/cs285/infrastructure/utils.py b/hw2/cs285/infrastructure/utils.py index d23b4d3e..96e9f6e7 100644 --- a/hw2/cs285/infrastructure/utils.py +++ b/hw2/cs285/infrastructure/utils.py @@ -55,13 +55,71 @@ def mean_squared_error(a, b): ############################################ def sample_trajectory(env, policy, max_path_length, render=False, render_mode=('rgb_array')): - # TODO: get this from hw1 + + # initialize env for the beginning of a new rollout + ob = env.reset() # HINT: should be the output of resetting the env + + # init vars + obs, acs, rewards, next_obs, terminals, image_obs = [], [], [], [], [], [] + steps = 0 + while True: + + # render image of the simulated env + if render: + if 'rgb_array' in render_mode: + # if hasattr(env, 'sim'): + # image_obs.append(env.sim.render(camera_name='track', height=500, width=500)[::-1]) + # else: + image_obs.append(env.render(mode=render_mode)) + if 'human' in render_mode: + env.render(mode=render_mode) + time.sleep(env.model.opt.timestep) + + # use the most recent ob to decide what to do + obs.append(ob) + ac = policy.get_action(ob) # HINT: query the policy's get_action function + ac = ac[0] + acs.append(ac) + + # take that action and record results + ob, rew, done, _ = env.step(ac) + + # record result of taking that action + steps += 1 + next_obs.append(ob) + rewards.append(rew) + + # rollout can end due to done, or due to max_path_length + rollout_done = 1 if done or steps >= max_path_length else 0 + terminals.append(rollout_done) + + if rollout_done: + break + + return Path(obs, image_obs, acs, rewards, next_obs, terminals) def sample_trajectories(env, policy, min_timesteps_per_batch, max_path_length, render=False, render_mode=('rgb_array')): - # TODO: get this from hw1 + """ + Collect rollouts until we have collected min_timesteps_per_batch steps. + """ + paths = [] + timesteps_this_batch = 0 + while timesteps_this_batch < min_timesteps_per_batch: + path = sample_trajectory(env, policy, max_path_length, render, render_mode) + paths.append(path) + timesteps_this_batch += get_pathlength(path) + + return paths, timesteps_this_batch def sample_n_trajectories(env, policy, ntraj, max_path_length, render=False, render_mode=('rgb_array')): - # TODO: get this from hw1 + """ + Collect ntraj rollouts. 
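# Usage sketch for the sampling helpers above, with a hypothetical random
# policy; it only shows the expected call pattern and return shapes and is
# not a definitive test of the homework code.
import gym
import numpy as np

class _RandomPolicySketch:
    """Mimics the policy interface sample_trajectory expects: get_action returns a batch of one action."""
    def __init__(self, action_space):
        self.action_space = action_space

    def get_action(self, ob):
        return np.array([self.action_space.sample()])

# _env = gym.make('CartPole-v1')
# _paths, _steps = sample_trajectories(_env, _RandomPolicySketch(_env.action_space),
#                                      min_timesteps_per_batch=200, max_path_length=100)
# _steps >= 200, and no path exceeds max_path_length steps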
+ """ + paths = [] + for _ in range(ntraj): + paths.append(sample_trajectory(env, policy, max_path_length, render, render_mode)) + + return paths ############################################ ############################################ diff --git a/hw2/cs285/policies/MLP_policy.py b/hw2/cs285/policies/MLP_policy.py index c6f3b463..7ae46ccc 100644 --- a/hw2/cs285/policies/MLP_policy.py +++ b/hw2/cs285/policies/MLP_policy.py @@ -1,5 +1,6 @@ import abc import itertools +from typing import cast from torch import nn from torch.nn import functional as F from torch import optim @@ -52,7 +53,7 @@ def __init__(self, self.mean_net = ptu.build_mlp(input_size=self.ob_dim, output_size=self.ac_dim, n_layers=self.n_layers, size=self.size) - self.logstd = nn.Parameter( + self.logstd = nn.parameter.Parameter( torch.zeros(self.ac_dim, dtype=torch.float32, device=ptu.device) ) self.mean_net.to(ptu.device) @@ -86,7 +87,15 @@ def save(self, filepath): # query the policy with observation(s) to get selected action(s) def get_action(self, obs: np.ndarray) -> np.ndarray: - # TODO: get this from HW1 + if len(obs.shape) > 1: + observation = obs + else: + observation = obs[None] + + observation = ptu.from_numpy(observation.astype(np.float32)) + + distn = self(observation) + return ptu.to_numpy(distn.sample()) # update/train this policy def update(self, observations, actions, **kwargs): @@ -97,12 +106,14 @@ def update(self, observations, actions, **kwargs): # through it. For example, you can return a torch.FloatTensor. You can also # return more flexible objects, such as a # `torch.distributions.Distribution` object. It's up to you! - def forward(self, observation: torch.FloatTensor): + def forward(self, observation: torch.FloatTensor) -> torch.distributions.Distribution: if self.discrete: + assert self.logits_na is not None logits = self.logits_na(observation) action_distribution = distributions.Categorical(logits=logits) return action_distribution else: + assert self.mean_net is not None and self.logstd is not None batch_mean = self.mean_net(observation) scale_tril = torch.diag(torch.exp(self.logstd)) batch_dim = batch_mean.shape[0] @@ -123,7 +134,7 @@ def __init__(self, ac_dim, ob_dim, n_layers, size, **kwargs): self.baseline_loss = nn.MSELoss() def update(self, observations, actions, advantages, q_values=None): - observations = ptu.from_numpy(observations) + observations = cast(torch.FloatTensor, ptu.from_numpy(observations)) actions = ptu.from_numpy(actions) advantages = ptu.from_numpy(advantages) @@ -137,7 +148,12 @@ def update(self, observations, actions, advantages, q_values=None): # HINT4: use self.optimizer to optimize the loss. Remember to # 'zero_grad' first - TODO + log_probs = self.forward(observations).log_prob(actions) + loss = -(log_probs * advantages).mean() + + self.optimizer.zero_grad() + loss.backward() + self.optimizer.step() if self.nn_baseline: ## TODO: update the neural network baseline using the q_values as @@ -148,8 +164,15 @@ def update(self, observations, actions, advantages, q_values=None): ## updating the baseline. 
Remember to 'zero_grad' first ## HINT2: You will need to convert the targets into a tensor using ## ptu.from_numpy before using it in the loss + assert isinstance(q_values, np.ndarray) and isinstance(self.baseline, nn.Module) + q_values = (q_values - q_values.mean()) / q_values.std() + q_values = ptu.from_numpy(q_values) + baseline_v = self.baseline(observations).squeeze() + baseline_loss = self.baseline_loss(baseline_v, q_values) - TODO + self.baseline_optimizer.zero_grad() + baseline_loss.backward() + self.baseline_optimizer.step() train_log = { 'Training Loss': ptu.to_numpy(loss), @@ -166,6 +189,7 @@ def run_baseline_prediction(self, observations): Output: np.ndarray of size [N] """ + assert self.baseline is not None observations = ptu.from_numpy(observations) pred = self.baseline(observations) return ptu.to_numpy(pred.squeeze()) diff --git a/hw2/cs285/scripts/run_hw2.py b/hw2/cs285/scripts/run_hw2.py index 156b4b65..c6e9df5e 100644 --- a/hw2/cs285/scripts/run_hw2.py +++ b/hw2/cs285/scripts/run_hw2.py @@ -1,6 +1,11 @@ +import sys import os + +sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))) + import time +from pyvirtualdisplay.display import Display from cs285.infrastructure.rl_trainer import RL_Trainer from cs285.agents.pg_agent import PGAgent @@ -69,6 +74,7 @@ def main(): parser.add_argument('--train_batch_size', '-tb', type=int, default=1000) ##steps used per gradient step parser.add_argument('--num_agent_train_steps_per_iter', type=int, default=1) + parser.add_argument('--num_gradient_steps_per_traj', type=int, default=1) parser.add_argument('--discount', type=float, default=1.0) parser.add_argument('--learning_rate', '-lr', type=float, default=5e-3) parser.add_argument('--n_layers', '-l', type=int, default=2) @@ -94,7 +100,7 @@ def main(): # note that, to avoid confusion, you don't even have a train_batch_size argument anymore (above) params['train_batch_size'] = params['batch_size'] -################################## + ################################## ### CREATE DIRECTORY FOR LOGGING ################################## @@ -111,6 +117,14 @@ def main(): if not(os.path.exists(logdir)): os.makedirs(logdir) + #################################### + ### Virtual display for rendering + #################################### + display = None + if params['video_log_freq'] > 0: + display = Display(visible=False, size=(400, 300), backend="xvfb") + display.start() + ################### ### RUN TRAINING ################### @@ -118,6 +132,9 @@ def main(): trainer = PG_Trainer(params) trainer.run_training_loop() + if display is not None: + display.stop() + if __name__ == "__main__": main()
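# Closing sketch of the baseline target convention used in this patch
# (illustrative numpy only): the baseline is regressed onto standardized
# Q-values, so estimate_advantage() above rescales its raw predictions by
# the current batch's mean and std before forming advantages.
import numpy as np

_q_values = np.array([1.0, 3.0, 5.0, 7.0])
_targets = (_q_values - _q_values.mean()) / _q_values.std()        # what the baseline MSE loss fits

_baseline_raw = _targets + 0.1                                      # stand-in for imperfect predictions
_values = _baseline_raw * _q_values.std() + _q_values.mean()        # un-standardize before use
_advantages = _q_values - _values                                    # baseline-subtracted advantages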