diff --git a/.gitignore b/.gitignore
new file mode 100644
index 00000000..66a5d065
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+*.pyc
+data/
+*.DS_Store
+*~
diff --git a/README.md b/README.md
new file mode 100644
index 00000000..0cef3f05
--- /dev/null
+++ b/README.md
@@ -0,0 +1 @@
+Assignments for [Berkeley CS 285: Deep Reinforcement Learning, Decision Making, and Control](http://rail.eecs.berkeley.edu/deeprlcourse/).
diff --git a/hw1/README.md b/hw1/README.md
new file mode 100644
index 00000000..83d903ca
--- /dev/null
+++ b/hw1/README.md
@@ -0,0 +1,77 @@
+## Setup
+
+You can run this code on your own machine or on Google Colab.
+
+1. **Local option:** If you choose to run locally, you will need to install MuJoCo and some Python packages; see [installation.md](installation.md) for instructions.
+2. **Colab:** The first few sections of the notebook will install all required dependencies. You can try out the Colab option by clicking the badge below:
+
+[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/berkeleydeeprlcourse/homework_fall2021/blob/master/hw1/cs285/scripts/run_hw1.ipynb)
+
+## Complete the code
+
+Fill in sections marked with `TODO`. In particular, see
+ - [infrastructure/rl_trainer.py](cs285/infrastructure/rl_trainer.py)
+ - [policies/MLP_policy.py](cs285/policies/MLP_policy.py)
+ - [infrastructure/replay_buffer.py](cs285/infrastructure/replay_buffer.py)
+ - [infrastructure/utils.py](cs285/infrastructure/utils.py)
+ - [infrastructure/pytorch_util.py](cs285/infrastructure/pytorch_util.py)
+
+Look for sections marked with `HW1` to see how the edits you make will be used.
+Some other files that you may find relevant:
+ - [scripts/run_hw1.py](cs285/scripts/run_hw1.py) (if running locally) or [scripts/run_hw1.ipynb](cs285/scripts/run_hw1.ipynb) (if running on Colab)
+ - [agents/bc_agent.py](cs285/agents/bc_agent.py)
+
+See the homework pdf for more details.
+
+## Run the code
+
+Tip: While debugging, you probably want to keep the flag `--video_log_freq -1`, which disables video logging and speeds up the experiment. However, feel free to remove it to save videos of your awesome policy!
+
+If running on Colab, adjust the `#@params` in the `Args` class according to the command-line arguments below.
+
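+For example, to mirror the DAgger run from Section 2 below, the corresponding `Args` fields in the notebook would look something like:
+
+```
+do_dagger = True
+n_iter = 10
+exp_name = 'dagger_ant'
+```
+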
+### Section 1 (Behavior Cloning)
+Command for problem 1:
+
+```
+python cs285/scripts/run_hw1.py \
+ --expert_policy_file cs285/policies/experts/Ant.pkl \
+ --env_name Ant-v2 --exp_name bc_ant --n_iter 1 \
+    --expert_data cs285/expert_data/expert_data_Ant-v2.pkl \
+    --video_log_freq -1
+```
+
+Make sure to also try another environment.
+See the homework PDF for more details on what else you need to run.
+To generate videos of the policy, remove the `--video_log_freq -1` flag.
+
+### Section 2 (DAgger)
+Command for section 2:
+(Note the `--do_dagger` flag, and the higher value for `n_iter`)
+
+```
+python cs285/scripts/run_hw1.py \
+ --expert_policy_file cs285/policies/experts/Ant.pkl \
+ --env_name Ant-v2 --exp_name dagger_ant --n_iter 10 \
+ --do_dagger --expert_data cs285/expert_data/expert_data_Ant-v2.pkl \
+ --video_log_freq -1
+```
+
+Make sure to also try another environment.
+See the homework PDF for more details on what else you need to run.
+
+## Visualizing the saved tensorboard event file
+
+You can visualize your runs using tensorboard:
+```
+tensorboard --logdir data
+```
+
+You will see scalar summaries as well as videos of your trained policies (in the 'images' tab).
+
+You can choose to visualize specific runs with a comma-separated list:
+```
+tensorboard --logdir data/run1,data/run2,data/run3...
+```
+
+If running on Colab, you will be using the `%tensorboard` [line magic](https://ipython.readthedocs.io/en/stable/interactive/magics.html) to do the same thing; see the [notebook](cs285/scripts/run_hw1.ipynb) for more details.
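+A minimal pair of Colab cells for this (matching what the notebook already does) is:
+
+```
+%load_ext tensorboard
+%tensorboard --logdir /content/cs285_f2021/hw1/data
+```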
+
diff --git a/hw1/cs285/agents/__init__.py b/hw1/cs285/agents/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/hw1/cs285/agents/base_agent.py b/hw1/cs285/agents/base_agent.py
new file mode 100644
index 00000000..d7712a05
--- /dev/null
+++ b/hw1/cs285/agents/base_agent.py
@@ -0,0 +1,17 @@
+
+class BaseAgent(object):
+ def __init__(self, **kwargs):
+ super(BaseAgent, self).__init__(**kwargs)
+
+ def train(self) -> dict:
+ """Return a dictionary of logging information."""
+ raise NotImplementedError
+
+ def add_to_replay_buffer(self, paths):
+ raise NotImplementedError
+
+ def sample(self, batch_size):
+ raise NotImplementedError
+
+ def save(self, path):
+ raise NotImplementedError
diff --git a/hw1/cs285/agents/bc_agent.py b/hw1/cs285/agents/bc_agent.py
new file mode 100644
index 00000000..b7ad366e
--- /dev/null
+++ b/hw1/cs285/agents/bc_agent.py
@@ -0,0 +1,40 @@
+from cs285.infrastructure.replay_buffer import ReplayBuffer
+from cs285.policies.MLP_policy import MLPPolicySL
+from .base_agent import BaseAgent
+
+
+class BCAgent(BaseAgent):
+ def __init__(self, env, agent_params):
+ super(BCAgent, self).__init__()
+
+ # init vars
+ self.env = env
+ self.agent_params = agent_params
+
+ # actor/policy
+ self.actor = MLPPolicySL(
+ self.agent_params['ac_dim'],
+ self.agent_params['ob_dim'],
+ self.agent_params['n_layers'],
+ self.agent_params['size'],
+ discrete=self.agent_params['discrete'],
+ learning_rate=self.agent_params['learning_rate'],
+ )
+
+ # replay buffer
+ self.replay_buffer = ReplayBuffer(self.agent_params['max_replay_buffer_size'])
+
+ def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
+ # training a BC agent refers to updating its actor using
+ # the given observations and corresponding action labels
+ log = self.actor.update(ob_no, ac_na) # HW1: you will modify this
+ return log
+
+ def add_to_replay_buffer(self, paths):
+ self.replay_buffer.add_rollouts(paths)
+
+ def sample(self, batch_size):
+ return self.replay_buffer.sample_random_data(batch_size) # HW1: you will modify this
+
+ def save(self, path):
+ return self.actor.save(path)
\ No newline at end of file
diff --git a/hw1/cs285/expert_data/expert_data_Ant-v2.pkl b/hw1/cs285/expert_data/expert_data_Ant-v2.pkl
new file mode 100644
index 00000000..d50d7a93
Binary files /dev/null and b/hw1/cs285/expert_data/expert_data_Ant-v2.pkl differ
diff --git a/hw1/cs285/expert_data/expert_data_HalfCheetah-v2.pkl b/hw1/cs285/expert_data/expert_data_HalfCheetah-v2.pkl
new file mode 100644
index 00000000..e661446b
Binary files /dev/null and b/hw1/cs285/expert_data/expert_data_HalfCheetah-v2.pkl differ
diff --git a/hw1/cs285/expert_data/expert_data_Hopper-v2.pkl b/hw1/cs285/expert_data/expert_data_Hopper-v2.pkl
new file mode 100644
index 00000000..6784546b
Binary files /dev/null and b/hw1/cs285/expert_data/expert_data_Hopper-v2.pkl differ
diff --git a/hw1/cs285/expert_data/expert_data_Humanoid-v2.pkl b/hw1/cs285/expert_data/expert_data_Humanoid-v2.pkl
new file mode 100644
index 00000000..9236099a
Binary files /dev/null and b/hw1/cs285/expert_data/expert_data_Humanoid-v2.pkl differ
diff --git a/hw1/cs285/expert_data/expert_data_Walker2d-v2.pkl b/hw1/cs285/expert_data/expert_data_Walker2d-v2.pkl
new file mode 100644
index 00000000..a0ec3a1e
Binary files /dev/null and b/hw1/cs285/expert_data/expert_data_Walker2d-v2.pkl differ
diff --git a/hw1/cs285/infrastructure/__init__.py b/hw1/cs285/infrastructure/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/hw1/cs285/infrastructure/colab_utils.py b/hw1/cs285/infrastructure/colab_utils.py
new file mode 100644
index 00000000..31ab6d9e
--- /dev/null
+++ b/hw1/cs285/infrastructure/colab_utils.py
@@ -0,0 +1,26 @@
+from gym.wrappers import Monitor
+import glob
+import io
+import base64
+from IPython.display import HTML
+from IPython import display as ipythondisplay
+
+## modified from https://colab.research.google.com/drive/1flu31ulJlgiRL1dnN2ir8wGh9p7Zij2t#scrollTo=TCelFzWY9MBI
+
+def show_video():
+ mp4list = glob.glob('/content/video/*.mp4')
+ if len(mp4list) > 0:
+ mp4 = mp4list[0]
+ video = io.open(mp4, 'r+b').read()
+ encoded = base64.b64encode(video)
+        ipythondisplay.display(HTML(data='''<video alt="test" autoplay
+                loop controls style="height: 400px;">
+                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
+             </video>'''.format(encoded.decode('ascii'))))
+ else:
+ print("Could not find video")
+
+
+def wrap_env(env):
+ env = Monitor(env, '/content/video', force=True)
+ return env
\ No newline at end of file
diff --git a/hw1/cs285/infrastructure/logger.py b/hw1/cs285/infrastructure/logger.py
new file mode 100644
index 00000000..a64931c0
--- /dev/null
+++ b/hw1/cs285/infrastructure/logger.py
@@ -0,0 +1,74 @@
+import os
+from tensorboardX import SummaryWriter
+import numpy as np
+
+class Logger:
+ def __init__(self, log_dir, n_logged_samples=10, summary_writer=None):
+ self._log_dir = log_dir
+ print('########################')
+ print('logging outputs to ', log_dir)
+ print('########################')
+ self._n_logged_samples = n_logged_samples
+ self._summ_writer = SummaryWriter(log_dir, flush_secs=1, max_queue=1)
+
+ def log_scalar(self, scalar, name, step_):
+ self._summ_writer.add_scalar('{}'.format(name), scalar, step_)
+
+ def log_scalars(self, scalar_dict, group_name, step, phase):
+ """Will log all scalars in the same plot."""
+ self._summ_writer.add_scalars('{}_{}'.format(group_name, phase), scalar_dict, step)
+
+ def log_image(self, image, name, step):
+ assert(len(image.shape) == 3) # [C, H, W]
+ self._summ_writer.add_image('{}'.format(name), image, step)
+
+ def log_video(self, video_frames, name, step, fps=10):
+ assert len(video_frames.shape) == 5, "Need [N, T, C, H, W] input tensor for video logging!"
+ self._summ_writer.add_video('{}'.format(name), video_frames, step, fps=fps)
+
+ def log_paths_as_videos(self, paths, step, max_videos_to_save=2, fps=10, video_title='video'):
+
+ # reshape the rollouts
+ videos = [np.transpose(p['image_obs'], [0, 3, 1, 2]) for p in paths]
+
+ # max rollout length
+ max_videos_to_save = np.min([max_videos_to_save, len(videos)])
+ max_length = videos[0].shape[0]
+ for i in range(max_videos_to_save):
+ if videos[i].shape[0]>max_length:
+ max_length = videos[i].shape[0]
+
+ # pad rollouts to all be same length
+ for i in range(max_videos_to_save):
+            if videos[i].shape[0] < max_length:
+                padding = np.tile([videos[i][-1]], (max_length - videos[i].shape[0], 1, 1, 1))
+                videos[i] = np.concatenate([videos[i], padding], 0)
+
+        # log videos to tensorboard event file
+        videos = np.stack(videos[:max_videos_to_save], 0)
+        self.log_video(videos, video_title, step, fps=fps)
+
+    def log_figures(self, figure, name, step, phase):
+        """figure: matplotlib.pyplot figure handle"""
+        assert figure.shape[0] > 0, "Figure logging requires input shape [batch x figures]!"
+        self._summ_writer.add_figure('{}_{}'.format(name, phase), figure, step)
+
+ def log_figure(self, figure, name, step, phase):
+ """figure: matplotlib.pyplot figure handle"""
+ self._summ_writer.add_figure('{}_{}'.format(name, phase), figure, step)
+
+ def log_graph(self, array, name, step, phase):
+ """figure: matplotlib.pyplot figure handle"""
+ im = plot_graph(array)
+ self._summ_writer.add_image('{}_{}'.format(name, phase), im, step)
+
+ def dump_scalars(self, log_path=None):
+ log_path = os.path.join(self._log_dir, "scalar_data.json") if log_path is None else log_path
+ self._summ_writer.export_scalars_to_json(log_path)
+
+ def flush(self):
+ self._summ_writer.flush()
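+
+# Illustrative usage only (the run directory below is a placeholder):
+#   logger = Logger('data/example_run')
+#   logger.log_scalar(1.0, 'Train_AverageReturn', 0)
+#   logger.flush()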
+
+
+
+
diff --git a/hw1/cs285/infrastructure/pytorch_util.py b/hw1/cs285/infrastructure/pytorch_util.py
new file mode 100644
index 00000000..bc7a4081
--- /dev/null
+++ b/hw1/cs285/infrastructure/pytorch_util.py
@@ -0,0 +1,75 @@
+from typing import Union
+
+import torch
+from torch import nn
+
+Activation = Union[str, nn.Module]
+
+
+_str_to_activation = {
+ 'relu': nn.ReLU(),
+ 'tanh': nn.Tanh(),
+ 'leaky_relu': nn.LeakyReLU(),
+ 'sigmoid': nn.Sigmoid(),
+ 'selu': nn.SELU(),
+ 'softplus': nn.Softplus(),
+ 'identity': nn.Identity(),
+}
+
+
+def build_mlp(
+ input_size: int,
+ output_size: int,
+ n_layers: int,
+ size: int,
+ activation: Activation = 'tanh',
+ output_activation: Activation = 'identity',
+) -> nn.Module:
+ """
+ Builds a feedforward neural network
+
+ arguments:
+ n_layers: number of hidden layers
+ size: dimension of each hidden layer
+ activation: activation of each hidden layer
+
+ input_size: size of the input layer
+ output_size: size of the output layer
+ output_activation: activation of the output layer
+
+ returns:
+ MLP (nn.Module)
+ """
+ if isinstance(activation, str):
+ activation = _str_to_activation[activation]
+ if isinstance(output_activation, str):
+ output_activation = _str_to_activation[output_activation]
+
+ # TODO: return a MLP. This should be an instance of nn.Module
+ # Note: nn.Sequential is an instance of nn.Module.
+ raise NotImplementedError
+
+
+device = None
+
+
+def init_gpu(use_gpu=True, gpu_id=0):
+ global device
+ if torch.cuda.is_available() and use_gpu:
+ device = torch.device("cuda:" + str(gpu_id))
+ print("Using GPU id {}".format(gpu_id))
+ else:
+ device = torch.device("cpu")
+ print("GPU not detected. Defaulting to CPU.")
+
+
+def set_device(gpu_id):
+ torch.cuda.set_device(gpu_id)
+
+
+def from_numpy(*args, **kwargs):
+ return torch.from_numpy(*args, **kwargs).float().to(device)
+
+
+def to_numpy(tensor):
+ return tensor.to('cpu').detach().numpy()
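+
+# Illustrative usage from other modules (which import this file as `ptu`),
+# assuming numpy is imported as np and init_gpu() has already been called:
+#   x = ptu.from_numpy(np.zeros(3))   # float32 tensor on ptu.device
+#   y = ptu.to_numpy(x)               # back to a numpy ndarray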
diff --git a/hw1/cs285/infrastructure/replay_buffer.py b/hw1/cs285/infrastructure/replay_buffer.py
new file mode 100644
index 00000000..60148e79
--- /dev/null
+++ b/hw1/cs285/infrastructure/replay_buffer.py
@@ -0,0 +1,89 @@
+from cs285.infrastructure.utils import *
+
+
+class ReplayBuffer(object):
+
+ def __init__(self, max_size=1000000):
+
+ self.max_size = max_size
+
+ # store each rollout
+ self.paths = []
+
+ # store (concatenated) component arrays from each rollout
+ self.obs = None
+ self.acs = None
+ self.rews = None
+ self.next_obs = None
+ self.terminals = None
+
+ def __len__(self):
+        if self.obs is not None:
+ return self.obs.shape[0]
+ else:
+ return 0
+
+ def add_rollouts(self, paths, concat_rew=True):
+
+ # add new rollouts into our list of rollouts
+ for path in paths:
+ self.paths.append(path)
+
+ # convert new rollouts into their component arrays, and append them onto
+ # our arrays
+ observations, actions, rewards, next_observations, terminals = (
+ convert_listofrollouts(paths, concat_rew))
+
+ if self.obs is None:
+ self.obs = observations[-self.max_size:]
+ self.acs = actions[-self.max_size:]
+ self.rews = rewards[-self.max_size:]
+ self.next_obs = next_observations[-self.max_size:]
+ self.terminals = terminals[-self.max_size:]
+ else:
+ self.obs = np.concatenate([self.obs, observations])[-self.max_size:]
+ self.acs = np.concatenate([self.acs, actions])[-self.max_size:]
+ if concat_rew:
+ self.rews = np.concatenate(
+ [self.rews, rewards]
+ )[-self.max_size:]
+ else:
+ if isinstance(rewards, list):
+ self.rews += rewards
+ else:
+ self.rews.append(rewards)
+ self.rews = self.rews[-self.max_size:]
+ self.next_obs = np.concatenate(
+ [self.next_obs, next_observations]
+ )[-self.max_size:]
+ self.terminals = np.concatenate(
+ [self.terminals, terminals]
+ )[-self.max_size:]
+
+ ########################################
+ ########################################
+
+ def sample_random_data(self, batch_size):
+ assert (
+ self.obs.shape[0]
+ == self.acs.shape[0]
+ == self.rews.shape[0]
+ == self.next_obs.shape[0]
+ == self.terminals.shape[0]
+ )
+
+ ## TODO return batch_size number of random entries from each of the 5 component arrays above
+ ## HINT 1: use np.random.permutation to sample random indices
+ ## HINT 2: return corresponding data points from each array (i.e., not different indices from each array)
+ ## HINT 3: look at the sample_recent_data function below
+
+ return TODO, TODO, TODO, TODO, TODO
+
+ def sample_recent_data(self, batch_size=1):
+ return (
+ self.obs[-batch_size:],
+ self.acs[-batch_size:],
+ self.rews[-batch_size:],
+ self.next_obs[-batch_size:],
+ self.terminals[-batch_size:],
+ )
diff --git a/hw1/cs285/infrastructure/rl_trainer.py b/hw1/cs285/infrastructure/rl_trainer.py
new file mode 100644
index 00000000..bb27972e
--- /dev/null
+++ b/hw1/cs285/infrastructure/rl_trainer.py
@@ -0,0 +1,269 @@
+from collections import OrderedDict
+import numpy as np
+import time
+
+import gym
+import torch
+
+from cs285.infrastructure import pytorch_util as ptu
+from cs285.infrastructure.logger import Logger
+from cs285.infrastructure import utils
+
+# how many rollouts to save as videos to tensorboard
+MAX_NVIDEO = 2
+MAX_VIDEO_LEN = 40 # we overwrite this in the code below
+
+
+class RL_Trainer(object):
+
+ def __init__(self, params):
+
+ #############
+ ## INIT
+ #############
+
+        # Get params, create logger
+ self.params = params
+ self.logger = Logger(self.params['logdir'])
+
+ # Set random seeds
+ seed = self.params['seed']
+ np.random.seed(seed)
+ torch.manual_seed(seed)
+ ptu.init_gpu(
+ use_gpu=not self.params['no_gpu'],
+ gpu_id=self.params['which_gpu']
+ )
+
+ #############
+ ## ENV
+ #############
+
+ # Make the gym environment
+ self.env = gym.make(self.params['env_name'])
+ self.env.seed(seed)
+
+ # Maximum length for episodes
+ self.params['ep_len'] = self.params['ep_len'] or self.env.spec.max_episode_steps
+ MAX_VIDEO_LEN = self.params['ep_len']
+
+        # Is this env continuous, or discrete?
+ discrete = isinstance(self.env.action_space, gym.spaces.Discrete)
+ self.params['agent_params']['discrete'] = discrete
+
+ # Observation and action sizes
+ ob_dim = self.env.observation_space.shape[0]
+ ac_dim = self.env.action_space.n if discrete else self.env.action_space.shape[0]
+ self.params['agent_params']['ac_dim'] = ac_dim
+ self.params['agent_params']['ob_dim'] = ob_dim
+
+ # simulation timestep, will be used for video saving
+ if 'model' in dir(self.env):
+ self.fps = 1/self.env.model.opt.timestep
+ else:
+ self.fps = self.env.env.metadata['video.frames_per_second']
+
+ #############
+ ## AGENT
+ #############
+
+ agent_class = self.params['agent_class']
+ self.agent = agent_class(self.env, self.params['agent_params'])
+
+ def run_training_loop(self, n_iter, collect_policy, eval_policy,
+ initial_expertdata=None, relabel_with_expert=False,
+ start_relabel_with_expert=1, expert_policy=None):
+ """
+ :param n_iter: number of (dagger) iterations
+ :param collect_policy:
+ :param eval_policy:
+ :param initial_expertdata:
+ :param relabel_with_expert: whether to perform dagger
+ :param start_relabel_with_expert: iteration at which to start relabel with expert
+ :param expert_policy:
+ """
+
+ # init vars at beginning of training
+ self.total_envsteps = 0
+ self.start_time = time.time()
+
+ for itr in range(n_iter):
+ print("\n\n********** Iteration %i ************"%itr)
+
+ # decide if videos should be rendered/logged at this iteration
+ if itr % self.params['video_log_freq'] == 0 and self.params['video_log_freq'] != -1:
+ self.log_video = True
+ else:
+ self.log_video = False
+
+ # decide if metrics should be logged
+ if itr % self.params['scalar_log_freq'] == 0:
+ self.log_metrics = True
+ else:
+ self.log_metrics = False
+
+ # collect trajectories, to be used for training
+ training_returns = self.collect_training_trajectories(
+ itr,
+ initial_expertdata,
+ collect_policy,
+ self.params['batch_size']
+ ) # HW1: implement this function below
+ paths, envsteps_this_batch, train_video_paths = training_returns
+ self.total_envsteps += envsteps_this_batch
+
+ # relabel the collected obs with actions from a provided expert policy
+ if relabel_with_expert and itr>=start_relabel_with_expert:
+ paths = self.do_relabel_with_expert(expert_policy, paths) # HW1: implement this function below
+
+ # add collected data to replay buffer
+ self.agent.add_to_replay_buffer(paths)
+
+ # train agent (using sampled data from replay buffer)
+ training_logs = self.train_agent() # HW1: implement this function below
+
+ # log/save
+ if self.log_video or self.log_metrics:
+
+ # perform logging
+ print('\nBeginning logging procedure...')
+ self.perform_logging(
+ itr, paths, eval_policy, train_video_paths, training_logs)
+
+ if self.params['save_params']:
+ print('\nSaving agent params')
+ self.agent.save('{}/policy_itr_{}.pt'.format(self.params['logdir'], itr))
+
+ ####################################
+ ####################################
+
+ def collect_training_trajectories(
+ self,
+ itr,
+ load_initial_expertdata,
+ collect_policy,
+ batch_size,
+ ):
+ """
+ :param itr:
+ :param load_initial_expertdata: path to expert data pkl file
+ :param collect_policy: the current policy using which we collect data
+ :param batch_size: the number of transitions we collect
+ :return:
+            paths: a list of trajectories
+ envsteps_this_batch: the sum over the numbers of environment steps in paths
+ train_video_paths: paths which also contain videos for visualization purposes
+ """
+
+ # TODO decide whether to load training data or use the current policy to collect more data
+ # HINT: depending on if it's the first iteration or not, decide whether to either
+ # (1) load the data. In this case you can directly return as follows
+ # ``` return loaded_paths, 0, None ```
+
+ # (2) collect `self.params['batch_size']` transitions
+
+ # TODO collect `batch_size` samples to be used for training
+ # HINT1: use sample_trajectories from utils
+ # HINT2: you want each of these collected rollouts to be of length self.params['ep_len']
+ print("\nCollecting data to be used for training...")
+ paths, envsteps_this_batch = TODO
+
+ # collect more rollouts with the same policy, to be saved as videos in tensorboard
+ # note: here, we collect MAX_NVIDEO rollouts, each of length MAX_VIDEO_LEN
+ train_video_paths = None
+ if self.log_video:
+ print('\nCollecting train rollouts to be used for saving videos...')
+ ## TODO look in utils and implement sample_n_trajectories
+ train_video_paths = utils.sample_n_trajectories(self.env, collect_policy, MAX_NVIDEO, MAX_VIDEO_LEN, True)
+
+ return paths, envsteps_this_batch, train_video_paths
+
+
+ def train_agent(self):
+ print('\nTraining agent using sampled data from replay buffer...')
+ all_logs = []
+ for train_step in range(self.params['num_agent_train_steps_per_iter']):
+
+ # TODO sample some data from the data buffer
+ # HINT1: use the agent's sample function
+ # HINT2: how much data = self.params['train_batch_size']
+ ob_batch, ac_batch, re_batch, next_ob_batch, terminal_batch = TODO
+
+ # TODO use the sampled data to train an agent
+ # HINT: use the agent's train function
+ # HINT: keep the agent's training log for debugging
+ train_log = TODO
+ all_logs.append(train_log)
+ return all_logs
+
+ def do_relabel_with_expert(self, expert_policy, paths):
+ print("\nRelabelling collected observations with labels from an expert policy...")
+
+        # TODO relabel collected observations (from our policy) with labels from an expert policy
+ # HINT: query the policy (using the get_action function) with paths[i]["observation"]
+ # and replace paths[i]["action"] with these expert labels
+
+ return paths
+
+ ####################################
+ ####################################
+
+ def perform_logging(self, itr, paths, eval_policy, train_video_paths, training_logs):
+
+ # collect eval trajectories, for logging
+ print("\nCollecting data for eval...")
+ eval_paths, eval_envsteps_this_batch = utils.sample_trajectories(self.env, eval_policy, self.params['eval_batch_size'], self.params['ep_len'])
+
+ # save eval rollouts as videos in tensorboard event file
+        if self.log_video and train_video_paths is not None:
+ print('\nCollecting video rollouts eval')
+ eval_video_paths = utils.sample_n_trajectories(self.env, eval_policy, MAX_NVIDEO, MAX_VIDEO_LEN, True)
+
+ #save train/eval videos
+ print('\nSaving train rollouts as videos...')
+ self.logger.log_paths_as_videos(train_video_paths, itr, fps=self.fps, max_videos_to_save=MAX_NVIDEO,
+ video_title='train_rollouts')
+ self.logger.log_paths_as_videos(eval_video_paths, itr, fps=self.fps,max_videos_to_save=MAX_NVIDEO,
+ video_title='eval_rollouts')
+
+ # save eval metrics
+ if self.log_metrics:
+ # returns, for logging
+ train_returns = [path["reward"].sum() for path in paths]
+ eval_returns = [eval_path["reward"].sum() for eval_path in eval_paths]
+
+ # episode lengths, for logging
+ train_ep_lens = [len(path["reward"]) for path in paths]
+ eval_ep_lens = [len(eval_path["reward"]) for eval_path in eval_paths]
+
+ # decide what to log
+ logs = OrderedDict()
+ logs["Eval_AverageReturn"] = np.mean(eval_returns)
+ logs["Eval_StdReturn"] = np.std(eval_returns)
+ logs["Eval_MaxReturn"] = np.max(eval_returns)
+ logs["Eval_MinReturn"] = np.min(eval_returns)
+ logs["Eval_AverageEpLen"] = np.mean(eval_ep_lens)
+
+ logs["Train_AverageReturn"] = np.mean(train_returns)
+ logs["Train_StdReturn"] = np.std(train_returns)
+ logs["Train_MaxReturn"] = np.max(train_returns)
+ logs["Train_MinReturn"] = np.min(train_returns)
+ logs["Train_AverageEpLen"] = np.mean(train_ep_lens)
+
+ logs["Train_EnvstepsSoFar"] = self.total_envsteps
+ logs["TimeSinceStart"] = time.time() - self.start_time
+ last_log = training_logs[-1] # Only use the last log for now
+ logs.update(last_log)
+
+
+ if itr == 0:
+ self.initial_return = np.mean(train_returns)
+ logs["Initial_DataCollection_AverageReturn"] = self.initial_return
+
+ # perform the logging
+ for key, value in logs.items():
+ print('{} : {}'.format(key, value))
+ self.logger.log_scalar(value, key, itr)
+ print('Done logging...\n\n')
+
+ self.logger.flush()
diff --git a/hw1/cs285/infrastructure/utils.py b/hw1/cs285/infrastructure/utils.py
new file mode 100644
index 00000000..d894480b
--- /dev/null
+++ b/hw1/cs285/infrastructure/utils.py
@@ -0,0 +1,119 @@
+import numpy as np
+import time
+
+############################################
+############################################
+
+def sample_trajectory(env, policy, max_path_length, render=False, render_mode=('rgb_array')):
+
+ # initialize env for the beginning of a new rollout
+ ob = TODO # HINT: should be the output of resetting the env
+
+ # init vars
+ obs, acs, rewards, next_obs, terminals, image_obs = [], [], [], [], [], []
+ steps = 0
+ while True:
+
+ # render image of the simulated env
+ if render:
+ if 'rgb_array' in render_mode:
+ if hasattr(env, 'sim'):
+ image_obs.append(env.sim.render(camera_name='track', height=500, width=500)[::-1])
+ else:
+ image_obs.append(env.render(mode=render_mode))
+ if 'human' in render_mode:
+ env.render(mode=render_mode)
+ time.sleep(env.model.opt.timestep)
+
+ # use the most recent ob to decide what to do
+ obs.append(ob)
+ ac = TODO # HINT: query the policy's get_action function
+ ac = ac[0]
+ acs.append(ac)
+
+ # take that action and record results
+ ob, rew, done, _ = env.step(ac)
+
+ # record result of taking that action
+ steps += 1
+ next_obs.append(ob)
+ rewards.append(rew)
+
+ # TODO end the rollout if the rollout ended
+ # HINT: rollout can end due to done, or due to max_path_length
+ rollout_done = TODO # HINT: this is either 0 or 1
+ terminals.append(rollout_done)
+
+ if rollout_done:
+ break
+
+ return Path(obs, image_obs, acs, rewards, next_obs, terminals)
+
+def sample_trajectories(env, policy, min_timesteps_per_batch, max_path_length, render=False, render_mode=('rgb_array')):
+ """
+ Collect rollouts until we have collected min_timesteps_per_batch steps.
+
+ TODO implement this function
+ Hint1: use sample_trajectory to get each path (i.e. rollout) that goes into paths
+ Hint2: use get_pathlength to count the timesteps collected in each path
+ """
+ timesteps_this_batch = 0
+ paths = []
+ while timesteps_this_batch < min_timesteps_per_batch:
+
+ TODO
+
+ return paths, timesteps_this_batch
+
+def sample_n_trajectories(env, policy, ntraj, max_path_length, render=False, render_mode=('rgb_array')):
+ """
+ Collect ntraj rollouts.
+
+ TODO implement this function
+ Hint1: use sample_trajectory to get each path (i.e. rollout) that goes into paths
+ """
+ paths = []
+
+ TODO
+
+ return paths
+
+############################################
+############################################
+
+def Path(obs, image_obs, acs, rewards, next_obs, terminals):
+ """
+ Take info (separate arrays) from a single rollout
+ and return it in a single dictionary
+ """
+ if image_obs != []:
+ image_obs = np.stack(image_obs, axis=0)
+ return {"observation" : np.array(obs, dtype=np.float32),
+ "image_obs" : np.array(image_obs, dtype=np.uint8),
+ "reward" : np.array(rewards, dtype=np.float32),
+ "action" : np.array(acs, dtype=np.float32),
+ "next_observation": np.array(next_obs, dtype=np.float32),
+ "terminal": np.array(terminals, dtype=np.float32)}
+
+
+def convert_listofrollouts(paths, concat_rew=True):
+ """
+ Take a list of rollout dictionaries
+ and return separate arrays,
+ where each array is a concatenation of that array from across the rollouts
+ """
+ observations = np.concatenate([path["observation"] for path in paths])
+ actions = np.concatenate([path["action"] for path in paths])
+ if concat_rew:
+ rewards = np.concatenate([path["reward"] for path in paths])
+ else:
+ rewards = [path["reward"] for path in paths]
+ next_observations = np.concatenate([path["next_observation"] for path in paths])
+ terminals = np.concatenate([path["terminal"] for path in paths])
+ return observations, actions, rewards, next_observations, terminals
+
+############################################
+############################################
+
+def get_pathlength(path):
+ return len(path["reward"])
\ No newline at end of file
diff --git a/hw1/cs285/policies/MLP_policy.py b/hw1/cs285/policies/MLP_policy.py
new file mode 100644
index 00000000..c8e1fd7d
--- /dev/null
+++ b/hw1/cs285/policies/MLP_policy.py
@@ -0,0 +1,116 @@
+import abc
+import itertools
+from typing import Any
+from torch import nn
+from torch.nn import functional as F
+from torch import optim
+
+import numpy as np
+import torch
+from torch import distributions
+
+from cs285.infrastructure import pytorch_util as ptu
+from cs285.policies.base_policy import BasePolicy
+
+
+class MLPPolicy(BasePolicy, nn.Module, metaclass=abc.ABCMeta):
+
+ def __init__(self,
+ ac_dim,
+ ob_dim,
+ n_layers,
+ size,
+ discrete=False,
+ learning_rate=1e-4,
+ training=True,
+ nn_baseline=False,
+ **kwargs
+ ):
+ super().__init__(**kwargs)
+
+ # init vars
+ self.ac_dim = ac_dim
+ self.ob_dim = ob_dim
+ self.n_layers = n_layers
+ self.discrete = discrete
+ self.size = size
+ self.learning_rate = learning_rate
+ self.training = training
+ self.nn_baseline = nn_baseline
+
+ if self.discrete:
+ self.logits_na = ptu.build_mlp(
+ input_size=self.ob_dim,
+ output_size=self.ac_dim,
+ n_layers=self.n_layers,
+ size=self.size,
+ )
+ self.logits_na.to(ptu.device)
+ self.mean_net = None
+ self.logstd = None
+ self.optimizer = optim.Adam(self.logits_na.parameters(),
+ self.learning_rate)
+ else:
+ self.logits_na = None
+ self.mean_net = ptu.build_mlp(
+ input_size=self.ob_dim,
+ output_size=self.ac_dim,
+ n_layers=self.n_layers, size=self.size,
+ )
+ self.mean_net.to(ptu.device)
+ self.logstd = nn.Parameter(
+ torch.zeros(self.ac_dim, dtype=torch.float32, device=ptu.device)
+ )
+ self.logstd.to(ptu.device)
+ self.optimizer = optim.Adam(
+ itertools.chain([self.logstd], self.mean_net.parameters()),
+ self.learning_rate
+ )
+
+ ##################################
+
+ def save(self, filepath):
+ torch.save(self.state_dict(), filepath)
+
+ ##################################
+
+ def get_action(self, obs: np.ndarray) -> np.ndarray:
+ if len(obs.shape) > 1:
+ observation = obs
+ else:
+ observation = obs[None]
+
+ # TODO return the action that the policy prescribes
+ raise NotImplementedError
+
+ # update/train this policy
+ def update(self, observations, actions, **kwargs):
+ raise NotImplementedError
+
+ # This function defines the forward pass of the network.
+ # You can return anything you want, but you should be able to differentiate
+ # through it. For example, you can return a torch.FloatTensor. You can also
+ # return more flexible objects, such as a
+ # `torch.distributions.Distribution` object. It's up to you!
+ def forward(self, observation: torch.FloatTensor) -> Any:
+ raise NotImplementedError
+
+
+#####################################################
+#####################################################
+
+class MLPPolicySL(MLPPolicy):
+ def __init__(self, ac_dim, ob_dim, n_layers, size, **kwargs):
+ super().__init__(ac_dim, ob_dim, n_layers, size, **kwargs)
+ self.loss = nn.MSELoss()
+
+ def update(
+ self, observations, actions,
+ adv_n=None, acs_labels_na=None, qvals=None
+ ):
+ # TODO: update the policy and return the loss
+ loss = TODO
+ return {
+ # You can add extra logging information here, but keep this line
+ 'Training Loss': ptu.to_numpy(loss),
+ }
diff --git a/hw1/cs285/policies/__init__.py b/hw1/cs285/policies/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/hw1/cs285/policies/base_policy.py b/hw1/cs285/policies/base_policy.py
new file mode 100644
index 00000000..e089540a
--- /dev/null
+++ b/hw1/cs285/policies/base_policy.py
@@ -0,0 +1,14 @@
+import abc
+import numpy as np
+
+
+class BasePolicy(object, metaclass=abc.ABCMeta):
+ def get_action(self, obs: np.ndarray) -> np.ndarray:
+ raise NotImplementedError
+
+ def update(self, obs: np.ndarray, acs: np.ndarray, **kwargs) -> dict:
+ """Return a dictionary of logging information."""
+ raise NotImplementedError
+
+ def save(self, filepath: str):
+ raise NotImplementedError
diff --git a/hw1/cs285/policies/experts/Ant.pkl b/hw1/cs285/policies/experts/Ant.pkl
new file mode 100644
index 00000000..7b87dedb
Binary files /dev/null and b/hw1/cs285/policies/experts/Ant.pkl differ
diff --git a/hw1/cs285/policies/experts/HalfCheetah.pkl b/hw1/cs285/policies/experts/HalfCheetah.pkl
new file mode 100644
index 00000000..1834540e
Binary files /dev/null and b/hw1/cs285/policies/experts/HalfCheetah.pkl differ
diff --git a/hw1/cs285/policies/experts/Hopper.pkl b/hw1/cs285/policies/experts/Hopper.pkl
new file mode 100644
index 00000000..7b09de35
Binary files /dev/null and b/hw1/cs285/policies/experts/Hopper.pkl differ
diff --git a/hw1/cs285/policies/experts/Humanoid.pkl b/hw1/cs285/policies/experts/Humanoid.pkl
new file mode 100644
index 00000000..c3bffc8c
Binary files /dev/null and b/hw1/cs285/policies/experts/Humanoid.pkl differ
diff --git a/hw1/cs285/policies/experts/Walker2d.pkl b/hw1/cs285/policies/experts/Walker2d.pkl
new file mode 100644
index 00000000..19cbf45e
Binary files /dev/null and b/hw1/cs285/policies/experts/Walker2d.pkl differ
diff --git a/hw1/cs285/policies/loaded_gaussian_policy.py b/hw1/cs285/policies/loaded_gaussian_policy.py
new file mode 100644
index 00000000..720ec472
--- /dev/null
+++ b/hw1/cs285/policies/loaded_gaussian_policy.py
@@ -0,0 +1,105 @@
+import numpy as np
+
+from cs285.infrastructure import pytorch_util as ptu
+from .base_policy import BasePolicy
+from torch import nn
+import torch
+import pickle
+
+
+def create_linear_layer(W, b) -> nn.Linear:
+ out_features, in_features = W.shape
+ linear_layer = nn.Linear(
+ in_features,
+ out_features,
+ )
+ linear_layer.weight.data = ptu.from_numpy(W.T)
+ linear_layer.bias.data = ptu.from_numpy(b[0])
+ return linear_layer
+
+
+def read_layer(l):
+ assert list(l.keys()) == ['AffineLayer']
+ assert sorted(l['AffineLayer'].keys()) == ['W', 'b']
+ return l['AffineLayer']['W'].astype(np.float32), l['AffineLayer'][
+ 'b'].astype(np.float32)
+
+
+class LoadedGaussianPolicy(BasePolicy, nn.Module):
+ def __init__(self, filename, **kwargs):
+ super().__init__(**kwargs)
+
+ with open(filename, 'rb') as f:
+ data = pickle.loads(f.read())
+
+ self.nonlin_type = data['nonlin_type']
+ if self.nonlin_type == 'lrelu':
+ self.non_lin = nn.LeakyReLU(0.01)
+ elif self.nonlin_type == 'tanh':
+ self.non_lin = nn.Tanh()
+ else:
+ raise NotImplementedError()
+ policy_type = [k for k in data.keys() if k != 'nonlin_type'][0]
+
+ assert policy_type == 'GaussianPolicy', (
+ 'Policy type {} not supported'.format(policy_type)
+ )
+ self.policy_params = data[policy_type]
+
+ assert set(self.policy_params.keys()) == {
+ 'logstdevs_1_Da', 'hidden', 'obsnorm', 'out'
+ }
+
+ # Build the policy. First, observation normalization.
+ assert list(self.policy_params['obsnorm'].keys()) == ['Standardizer']
+ obsnorm_mean = self.policy_params['obsnorm']['Standardizer']['mean_1_D']
+ obsnorm_meansq = self.policy_params['obsnorm']['Standardizer'][
+ 'meansq_1_D']
+ obsnorm_stdev = np.sqrt(
+ np.maximum(0, obsnorm_meansq - np.square(obsnorm_mean)))
+ print('obs', obsnorm_mean.shape, obsnorm_stdev.shape)
+
+ self.obs_norm_mean = nn.Parameter(ptu.from_numpy(obsnorm_mean))
+ self.obs_norm_std = nn.Parameter(ptu.from_numpy(obsnorm_stdev))
+ self.hidden_layers = nn.ModuleList()
+
+ # Hidden layers next
+ assert list(self.policy_params['hidden'].keys()) == ['FeedforwardNet']
+ layer_params = self.policy_params['hidden']['FeedforwardNet']
+ for layer_name in sorted(layer_params.keys()):
+ l = layer_params[layer_name]
+ W, b = read_layer(l)
+ linear_layer = create_linear_layer(W, b)
+ self.hidden_layers.append(linear_layer)
+
+ # Output layer
+ W, b = read_layer(self.policy_params['out'])
+ self.output_layer = create_linear_layer(W, b)
+
+ def forward(self, obs):
+ normed_obs = (obs - self.obs_norm_mean) / (self.obs_norm_std + 1e-6)
+ h = normed_obs
+ for layer in self.hidden_layers:
+ h = layer(h)
+ h = self.non_lin(h)
+ return self.output_layer(h)
+
+ ##################################
+
+ def update(self, obs_no, acs_na, adv_n=None, acs_labels_na=None):
+ raise NotImplementedError("""
+ This policy class simply loads in a particular type of policy and
+ queries it. Do not try to train it.
+ """)
+
+ def get_action(self, obs):
+ if len(obs.shape) > 1:
+ observation = obs
+ else:
+ observation = obs[None, :]
+ observation = ptu.from_numpy(observation.astype(np.float32))
+ action = self(observation)
+ return ptu.to_numpy(action)
+
+ def save(self, filepath):
+ torch.save(self.state_dict(), filepath)
diff --git a/hw1/cs285/scripts/run_hw1.ipynb b/hw1/cs285/scripts/run_hw1.ipynb
new file mode 100644
index 00000000..a08180f5
--- /dev/null
+++ b/hw1/cs285/scripts/run_hw1.ipynb
@@ -0,0 +1,529 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "mLXw6zd-k3Xd"
+ },
+ "source": [
+ "##Setup\n",
+ "\n",
+ "You will need to make a copy of this notebook in your Google Drive before you can edit the homework files. You can do so with **File → Save a copy in Drive**."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "cellView": "form",
+ "colab": {},
+ "colab_type": "code",
+ "id": "4HBPnmbIPPyl"
+ },
+ "outputs": [],
+ "source": [
+ "#@title mount your Google Drive\n",
+ "#@markdown Your work will be stored in a folder called `cs285_f2021` by default to prevent Colab instance timeouts from deleting your edits.\n",
+ "\n",
+ "import os\n",
+ "from google.colab import drive\n",
+ "drive.mount('/content/gdrive')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "cellView": "form",
+ "colab": {},
+ "colab_type": "code",
+ "id": "OuCfTLJIx5nQ"
+ },
+ "outputs": [],
+ "source": [
+ "#@title set up mount symlink\n",
+ "\n",
+ "DRIVE_PATH = '/content/gdrive/My\\ Drive/cs285_f2021'\n",
+ "DRIVE_PYTHON_PATH = DRIVE_PATH.replace('\\\\', '')\n",
+ "if not os.path.exists(DRIVE_PYTHON_PATH):\n",
+ " %mkdir $DRIVE_PATH\n",
+ "\n",
+ "## the space in `My Drive` causes some issues,\n",
+ "## make a symlink to avoid this\n",
+ "SYM_PATH = '/content/cs285_f2021'\n",
+ "if not os.path.exists(SYM_PATH):\n",
+ " !ln -s $DRIVE_PATH $SYM_PATH"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "cellView": "form",
+ "colab": {},
+ "colab_type": "code",
+ "id": "XTtWDO-Bkqnn"
+ },
+ "outputs": [],
+ "source": [
+ "#@title apt install requirements\n",
+ "\n",
+ "#@markdown Run each section with Shift+Enter\n",
+ "\n",
+ "#@markdown Double-click on section headers to show code.\n",
+ "\n",
+ "!apt update \n",
+ "!apt install -y --no-install-recommends \\\n",
+ " build-essential \\\n",
+ " curl \\\n",
+ " git \\\n",
+ " gnupg2 \\\n",
+ " make \\\n",
+ " cmake \\\n",
+ " ffmpeg \\\n",
+ " swig \\\n",
+ " libz-dev \\\n",
+ " unzip \\\n",
+ " zlib1g-dev \\\n",
+ " libglfw3 \\\n",
+ " libglfw3-dev \\\n",
+ " libxrandr2 \\\n",
+ " libxinerama-dev \\\n",
+ " libxi6 \\\n",
+ " libxcursor-dev \\\n",
+ " libgl1-mesa-dev \\\n",
+ " libgl1-mesa-glx \\\n",
+ " libglew-dev \\\n",
+ " libosmesa6-dev \\\n",
+ " lsb-release \\\n",
+ " ack-grep \\\n",
+ " patchelf \\\n",
+ " wget \\\n",
+ " xpra \\\n",
+ " xserver-xorg-dev \\\n",
+ " xvfb \\\n",
+ " python-opengl \\\n",
+ " ffmpeg"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "cellView": "form",
+ "colab": {},
+ "colab_type": "code",
+ "id": "QeDMsMOXUAkN"
+ },
+ "outputs": [],
+ "source": [
+ "#@title download mujoco\n",
+ "\n",
+ "MJC_PATH = '{}/mujoco'.format(SYM_PATH)\n",
+ "%mkdir $MJC_PATH\n",
+ "%cd $MJC_PATH\n",
+ "!wget -q https://www.roboti.us/download/mujoco200_linux.zip\n",
+ "!unzip -q mujoco200_linux.zip\n",
+ "%mv mujoco200_linux mujoco200\n",
+ "%rm mujoco200_linux.zip"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "cellView": "form",
+ "colab": {},
+ "colab_type": "code",
+ "id": "wTsf6RYGk_pz"
+ },
+ "outputs": [],
+ "source": [
+ "#@title update mujoco paths\n",
+ "\n",
+ "import os\n",
+ "\n",
+ "os.environ['LD_LIBRARY_PATH'] += ':{}/mujoco200/bin'.format(MJC_PATH)\n",
+ "os.environ['MUJOCO_PY_MUJOCO_PATH'] = '{}/mujoco200'.format(MJC_PATH)\n",
+ "os.environ['MUJOCO_PY_MJKEY_PATH'] = '{}/mjkey.txt'.format(MJC_PATH)\n",
+ "\n",
+ "## installation on colab does not find *.so files\n",
+ "## in LD_LIBRARY_PATH, copy over manually instead\n",
+ "!cp $MJC_PATH/mujoco200/bin/*.so /usr/lib/x86_64-linux-gnu/"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "JI_nuhTulBvU"
+ },
+ "source": [
+ "Copy over `mjkey.txt` into `/content/cs285_f2021/mujoco` before this step"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "cellView": "form",
+ "colab": {},
+ "colab_type": "code",
+ "id": "amF0DgEyklFl"
+ },
+ "outputs": [],
+ "source": [
+ "#@title clone and install mujoco-py\n",
+ "\n",
+ "%cd $MJC_PATH\n",
+ "!git clone https://github.com/openai/mujoco-py.git\n",
+ "%cd mujoco-py\n",
+ "%pip install -e .\n",
+ "\n",
+ "## cythonize at the first import\n",
+ "import mujoco_py"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "cellView": "form",
+ "colab": {},
+ "colab_type": "code",
+ "id": "X_aXQac0f3pr"
+ },
+ "outputs": [],
+ "source": [
+ "#@title clone homework repo\n",
+ "\n",
+ "%cd $SYM_PATH\n",
+ "!git clone https://github.com/berkeleydeeprlcourse/homework_fall2021.git\n",
+ "%cd homework_fall2021/hw1\n",
+ "%pip install -r requirements_colab.txt\n",
+ "%pip install -e ."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "cellView": "form",
+ "colab": {},
+ "colab_type": "code",
+ "id": "8y_M1tGxmGhT"
+ },
+ "outputs": [],
+ "source": [
+ "#@title set up virtual display\n",
+ "\n",
+ "from pyvirtualdisplay import Display\n",
+ "\n",
+ "display = Display(visible=0, size=(1400, 900))\n",
+ "display.start()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "cellView": "form",
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 438
+ },
+ "colab_type": "code",
+ "id": "y7cywOEgo4a8",
+ "outputId": "c91293e2-0424-4427-b57e-0e12653c991a"
+ },
+ "outputs": [],
+ "source": [
+ "#@title test virtual display\n",
+ "\n",
+ "#@markdown If you see a video of a four-legged ant fumbling about, setup is complete!\n",
+ "\n",
+ "import gym\n",
+ "from cs285.infrastructure.colab_utils import (\n",
+ " wrap_env,\n",
+ " show_video\n",
+ ") \n",
+ "\n",
+ "env = wrap_env(gym.make(\"Ant-v2\"))\n",
+ "\n",
+ "observation = env.reset()\n",
+ "for i in range(100):\n",
+ " env.render(mode='rgb_array')\n",
+ " obs, rew, term, _ = env.step(env.action_space.sample() ) \n",
+ " if term:\n",
+ " break;\n",
+ " \n",
+ "env.close()\n",
+ "print('Loading video...')\n",
+ "show_video()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "eQx7oDGeeKWj"
+ },
+ "source": [
+ "## Editing Code\n",
+ "\n",
+    "To edit code, click the folder icon on the left menu. Navigate to the corresponding file (`cs285_f2021/...`). Double-click a file to open an editor. There is a timeout of roughly 12 hours while a Colab instance is active (less if you close your browser window). We sync your edits to Google Drive so that you won't lose your work in the event of an instance timeout, but you will need to re-mount your Google Drive and re-install packages with every new instance."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "UunygyDXrx7k"
+ },
+ "source": [
+ "## Run Behavior Cloning (Problem 1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "cellView": "form",
+ "colab": {},
+ "colab_type": "code",
+ "id": "enh5ZMHftEO7"
+ },
+ "outputs": [],
+ "source": [
+ "#@title imports\n",
+ "\n",
+ "import os\n",
+ "import time\n",
+ "import numpy as np\n",
+ "\n",
+ "from cs285.infrastructure.rl_trainer import RL_Trainer\n",
+ "from cs285.agents.bc_agent import BCAgent\n",
+ "from cs285.policies.loaded_gaussian_policy import LoadedGaussianPolicy\n",
+ "\n",
+ "%load_ext autoreload\n",
+ "%autoreload 2"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "cellView": "form",
+ "colab": {},
+ "colab_type": "code",
+ "id": "imnAkQ6jryL7"
+ },
+ "outputs": [],
+ "source": [
+ "#@title runtime arguments\n",
+ "\n",
+ "class Args:\n",
+ "\n",
+ " def __getitem__(self, key):\n",
+ " return getattr(self, key)\n",
+ "\n",
+ " def __setitem__(self, key, val):\n",
+ " setattr(self, key, val)\n",
+ "\n",
+ " #@markdown expert data\n",
+ " expert_policy_file = 'cs285/policies/experts/Ant.pkl' #@param\n",
+ " expert_data = 'cs285/expert_data/expert_data_Ant-v2.pkl' #@param\n",
+ " env_name = 'Ant-v2' #@param ['Ant-v2', 'Humanoid-v2', 'Walker2d-v2', 'HalfCheetah-v2', 'Hopper-v2']\n",
+ " exp_name = 'test_bc_ant' #@param\n",
+ " do_dagger = False #@param {type: \"boolean\"}\n",
+ " ep_len = 1000 #@param {type: \"integer\"}\n",
+ " save_params = False #@param {type: \"boolean\"}\n",
+ "\n",
+ " num_agent_train_steps_per_iter = 1000 #@param {type: \"integer\"})\n",
+ " n_iter = 1 #@param {type: \"integer\"})\n",
+ "\n",
+ " #@markdown batches & buffers\n",
+ " batch_size = 1000 #@param {type: \"integer\"})\n",
+ " eval_batch_size = 1000 #@param {type: \"integer\"}\n",
+ " train_batch_size = 100 #@param {type: \"integer\"}\n",
+ " max_replay_buffer_size = 1000000 #@param {type: \"integer\"}\n",
+ "\n",
+ " #@markdown network\n",
+ " n_layers = 2 #@param {type: \"integer\"}\n",
+ " size = 64 #@param {type: \"integer\"}\n",
+ " learning_rate = 5e-3 #@param {type: \"number\"}\n",
+ "\n",
+ " #@markdown logging\n",
+ " video_log_freq = 5 #@param {type: \"integer\"}\n",
+ " scalar_log_freq = 1 #@param {type: \"integer\"}\n",
+ "\n",
+ " #@markdown gpu & run-time settings\n",
+ " no_gpu = False #@param {type: \"boolean\"}\n",
+ " which_gpu = 0 #@param {type: \"integer\"}\n",
+ " seed = 1 #@param {type: \"integer\"}\n",
+ "\n",
+ "args = Args()\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "cellView": "form",
+ "colab": {},
+ "colab_type": "code",
+ "id": "fLnU1evmss4I"
+ },
+ "outputs": [],
+ "source": [
+ "#@title define `BC_Trainer`\n",
+ "class BC_Trainer(object):\n",
+ "\n",
+ " def __init__(self, params):\n",
+ " #######################\n",
+ " ## AGENT PARAMS\n",
+ " #######################\n",
+ "\n",
+ " agent_params = {\n",
+ " 'n_layers': params['n_layers'],\n",
+ " 'size': params['size'],\n",
+ " 'learning_rate': params['learning_rate'],\n",
+ " 'max_replay_buffer_size': params['max_replay_buffer_size'],\n",
+ " }\n",
+ "\n",
+ " self.params = params\n",
+ " self.params['agent_class'] = BCAgent ## TODO: look in here and implement this\n",
+ " self.params['agent_params'] = agent_params\n",
+ "\n",
+ " ################\n",
+ " ## RL TRAINER\n",
+ " ################\n",
+ "\n",
+ " self.rl_trainer = RL_Trainer(self.params) ## TODO: look in here and implement this\n",
+ "\n",
+ " #######################\n",
+ " ## LOAD EXPERT POLICY\n",
+ " #######################\n",
+ "\n",
+ " print('Loading expert policy from...', self.params['expert_policy_file'])\n",
+ " self.loaded_expert_policy = LoadedGaussianPolicy(self.params['expert_policy_file'])\n",
+ " print('Done restoring expert policy...')\n",
+ "\n",
+ " def run_training_loop(self):\n",
+ "\n",
+ " self.rl_trainer.run_training_loop(\n",
+ " n_iter=self.params['n_iter'],\n",
+ " initial_expertdata=self.params['expert_data'],\n",
+ " collect_policy=self.rl_trainer.agent.actor,\n",
+ " eval_policy=self.rl_trainer.agent.actor,\n",
+ " relabel_with_expert=self.params['do_dagger'],\n",
+ " expert_policy=self.loaded_expert_policy,\n",
+ " )\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "cellView": "form",
+ "colab": {},
+ "colab_type": "code",
+ "id": "7UkzHBfxsxH8"
+ },
+ "outputs": [],
+ "source": [
+ "#@title create directory for logging\n",
+ "\n",
+ "if args.do_dagger:\n",
+ " logdir_prefix = 'q2_' # The autograder uses the prefix `q2_`\n",
+ " assert args.n_iter>1, ('DAgger needs more than 1 iteration (n_iter>1) of training, to iteratively query the expert and train (after 1st warmstarting from behavior cloning).')\n",
+ "else:\n",
+ " logdir_prefix = 'q1_' # The autograder uses the prefix `q1_`\n",
+ " assert args.n_iter==1, ('Vanilla behavior cloning collects expert data just once (n_iter=1)')\n",
+ "\n",
+ "data_path ='/content/cs285_f2021/hw1/data'\n",
+ "if not (os.path.exists(data_path)):\n",
+ " os.makedirs(data_path)\n",
+ "logdir = logdir_prefix + args.exp_name + '_' + args.env_name + \\\n",
+ " '_' + time.strftime(\"%d-%m-%Y_%H-%M-%S\")\n",
+ "logdir = os.path.join(data_path, logdir)\n",
+ "args['logdir'] = logdir\n",
+ "if not(os.path.exists(logdir)):\n",
+ " os.makedirs(logdir)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {},
+ "colab_type": "code",
+ "id": "_qQb789_syt0"
+ },
+ "outputs": [],
+ "source": [
+ "## run training\n",
+ "print(args.logdir)\n",
+ "trainer = BC_Trainer(args)\n",
+ "trainer.run_training_loop()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {},
+ "colab_type": "code",
+ "id": "75M0MlR5tUIb"
+ },
+ "outputs": [],
+ "source": [
+ "#@markdown You can visualize your runs with tensorboard from within the notebook\n",
+ "\n",
+ "%load_ext tensorboard\n",
+ "%tensorboard --logdir /content/cs285_f2021/hw1/data"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "ff9onuUPfPEa"
+ },
+ "source": [
+ "## Running DAgger (Problem 2)\n",
+ "Modify the settings above:\n",
+ "1. check the `do_dagger` box\n",
+    "2. set `n_iter` to `10`\n",
+ "and then rerun the code."
+ ]
+ }
+ ],
+ "metadata": {
+ "colab": {
+ "name": "run_hw1.ipynb",
+ "provenance": [],
+ "toc_visible": true
+ },
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.13"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/hw1/cs285/scripts/run_hw1.py b/hw1/cs285/scripts/run_hw1.py
new file mode 100644
index 00000000..2a4a73de
--- /dev/null
+++ b/hw1/cs285/scripts/run_hw1.py
@@ -0,0 +1,120 @@
+import os
+import time
+
+from cs285.infrastructure.rl_trainer import RL_Trainer
+from cs285.agents.bc_agent import BCAgent
+from cs285.policies.loaded_gaussian_policy import LoadedGaussianPolicy
+
+class BC_Trainer(object):
+
+ def __init__(self, params):
+
+ #######################
+ ## AGENT PARAMS
+ #######################
+
+ agent_params = {
+ 'n_layers': params['n_layers'],
+ 'size': params['size'],
+ 'learning_rate': params['learning_rate'],
+ 'max_replay_buffer_size': params['max_replay_buffer_size'],
+ }
+
+ self.params = params
+ self.params['agent_class'] = BCAgent ## HW1: you will modify this
+ self.params['agent_params'] = agent_params
+
+ ################
+ ## RL TRAINER
+ ################
+
+ self.rl_trainer = RL_Trainer(self.params) ## HW1: you will modify this
+
+ #######################
+ ## LOAD EXPERT POLICY
+ #######################
+
+ print('Loading expert policy from...', self.params['expert_policy_file'])
+ self.loaded_expert_policy = LoadedGaussianPolicy(self.params['expert_policy_file'])
+ print('Done restoring expert policy...')
+
+ def run_training_loop(self):
+
+ self.rl_trainer.run_training_loop(
+ n_iter=self.params['n_iter'],
+ initial_expertdata=self.params['expert_data'],
+ collect_policy=self.rl_trainer.agent.actor,
+ eval_policy=self.rl_trainer.agent.actor,
+ relabel_with_expert=self.params['do_dagger'],
+ expert_policy=self.loaded_expert_policy,
+ )
+
+
+def main():
+ import argparse
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--expert_policy_file', '-epf', type=str, required=True) # relative to where you're running this script from
+ parser.add_argument('--expert_data', '-ed', type=str, required=True) #relative to where you're running this script from
+    parser.add_argument('--env_name', '-env', type=str, help='choices: Ant-v2, Humanoid-v2, Walker2d-v2, HalfCheetah-v2, Hopper-v2', required=True)
+ parser.add_argument('--exp_name', '-exp', type=str, default='pick an experiment name', required=True)
+ parser.add_argument('--do_dagger', action='store_true')
+ parser.add_argument('--ep_len', type=int)
+
+ parser.add_argument('--num_agent_train_steps_per_iter', type=int, default=1000) # number of gradient steps for training policy (per iter in n_iter)
+ parser.add_argument('--n_iter', '-n', type=int, default=1)
+
+ parser.add_argument('--batch_size', type=int, default=1000) # training data collected (in the env) during each iteration
+ parser.add_argument('--eval_batch_size', type=int,
+ default=1000) # eval data collected (in the env) for logging metrics
+ parser.add_argument('--train_batch_size', type=int,
+ default=100) # number of sampled data points to be used per gradient/train step
+
+    parser.add_argument('--n_layers', type=int, default=2)  # depth of the policy to be learned
+    parser.add_argument('--size', type=int, default=64)  # width of each layer of the policy to be learned
+ parser.add_argument('--learning_rate', '-lr', type=float, default=5e-3) # LR for supervised learning
+
+ parser.add_argument('--video_log_freq', type=int, default=5)
+ parser.add_argument('--scalar_log_freq', type=int, default=1)
+ parser.add_argument('--no_gpu', '-ngpu', action='store_true')
+ parser.add_argument('--which_gpu', type=int, default=0)
+ parser.add_argument('--max_replay_buffer_size', type=int, default=1000000)
+ parser.add_argument('--save_params', action='store_true')
+ parser.add_argument('--seed', type=int, default=1)
+ args = parser.parse_args()
+
+ # convert args to dictionary
+ params = vars(args)
+
+ ##################################
+ ### CREATE DIRECTORY FOR LOGGING
+ ##################################
+
+ if args.do_dagger:
+ # Use this prefix when submitting. The auto-grader uses this prefix.
+ logdir_prefix = 'q2_'
+        assert args.n_iter>1, ('DAgger needs more than 1 iteration (n_iter>1) of training, to iteratively query the expert and train (after 1st warmstarting from behavior cloning).')
+ else:
+ # Use this prefix when submitting. The auto-grader uses this prefix.
+ logdir_prefix = 'q1_'
+ assert args.n_iter==1, ('Vanilla behavior cloning collects expert data just once (n_iter=1)')
+
+ ## directory for logging
+ data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../data')
+ if not (os.path.exists(data_path)):
+ os.makedirs(data_path)
+ logdir = logdir_prefix + args.exp_name + '_' + args.env_name + '_' + time.strftime("%d-%m-%Y_%H-%M-%S")
+ logdir = os.path.join(data_path, logdir)
+ params['logdir'] = logdir
+ if not(os.path.exists(logdir)):
+ os.makedirs(logdir)
+
+
+ ###################
+ ### RUN TRAINING
+ ###################
+
+ trainer = BC_Trainer(params)
+ trainer.run_training_loop()
+
+if __name__ == "__main__":
+ main()
diff --git a/hw1/cs285_hw1.pdf b/hw1/cs285_hw1.pdf
new file mode 100644
index 00000000..4ca982d1
Binary files /dev/null and b/hw1/cs285_hw1.pdf differ
diff --git a/hw1/installation.md b/hw1/installation.md
new file mode 100644
index 00000000..95c2ecfa
--- /dev/null
+++ b/hw1/installation.md
@@ -0,0 +1,71 @@
+## Install mujoco:
+```
+mkdir ~/.mujoco
+cd ~/.mujoco
+wget https://www.roboti.us/download/mujoco200_linux.zip
+unzip mujoco200_linux.zip
+mv mujoco200_linux mujoco200
+rm mujoco200_linux.zip
+cp <location of mjkey.txt> .
+```
+The above instructions download MuJoCo for Linux. If you are on Mac or Windows, you will need to change the `wget` address to either
+`https://www.roboti.us/download/mujoco200_macos.zip` or `https://www.roboti.us/download/mujoco200_win64.zip`.
+
+Finally, add the following to the bottom of your `.bashrc`:
+```
+export LD_LIBRARY_PATH=~/.mujoco/mujoco200/bin/
+```
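+
+Optionally, once the Python dependencies below are installed, you can sanity-check the MuJoCo setup by triggering the one-time `mujoco-py` build:
+```
+python -c "import mujoco_py"
+```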
+
+## Install other dependencies
+
+
+There are two options:
+
+A. (Recommended) Install with conda:
+
+ 1. Install conda, if you don't already have it, by following the instructions at [this link](https://docs.conda.io/projects/conda/en/latest/user-guide/install/)
+
+ This install will modify the `PATH` variable in your bashrc.
+ You need to open a new terminal for that path change to take place (to be able to find 'conda' in the next step).
+
+ 2. Create a conda environment that will contain python 3:
+ ```
+ conda create -n cs285 python=3.6
+ ```
+
+ 3. Activate the environment (do this every time you open a new terminal and want to run code):
+ ```
+ source activate cs285
+ ```
+
+ 4. Install the requirements into this conda environment
+ ```
+ pip install --user -r requirements.txt
+ ```
+
+ 5. Allow your code to be able to see 'cs285'
+ ```
+    cd <path_to_hw1>
+    pip install -e .
+ ```
+
+This conda environment requires activating it every time you open a new terminal (in order to run code), but the benefit is that the required dependencies for this codebase will not affect existing/other versions of things on your computer. This stand-alone environment will have everything that is necessary.
+
+
+B. Install on system Python:
+ ```
+ pip install -r requirements.txt
+    cd <path_to_hw1>
+ pip install -e .
+ ```
+
+
+## Debugging issues with installing `mujoco-py`
+
+If you run into issues with installing `mujoco-py` (especially on MacOS), here are a few common pointers to help:
+ 1. If you run into GCC issues, consider switching to GCC7 (`brew install gcc@7`)
+ 2. [Try this](https://github.com/hashicorp/terraform/issues/23033#issuecomment-543507812) if you run into developer verification issues (use due diligence when granting permissions to code from unfamiliar sources)
+ 3. StackOverflow is your friend; feel free to shamelessly look up your error and get in touch with your classmates or instructors
+ 4. If nothing works and you are frustrated beyond repair, consider using the Colab version of the homework!
diff --git a/hw1/requirements.txt b/hw1/requirements.txt
new file mode 100644
index 00000000..2c844233
--- /dev/null
+++ b/hw1/requirements.txt
@@ -0,0 +1,13 @@
+numpy
+gym==0.17.2
+mujoco-py==2.0.2.2
+tensorboard==2.3.0
+tensorboardX==1.8
+matplotlib==2.2.2
+ipython==6.4.0
+moviepy==1.0.0
+pyvirtualdisplay==1.3.2
+torch==1.6.0
+opencv-python==4.4.0.42
+ipdb==0.13.3
+box2d-py
diff --git a/hw1/requirements_colab.txt b/hw1/requirements_colab.txt
new file mode 100644
index 00000000..f43a1ca3
--- /dev/null
+++ b/hw1/requirements_colab.txt
@@ -0,0 +1,11 @@
+gym==0.17.2
+tensorboard==2.3.0
+tensorboardX==1.8
+matplotlib==2.2.2
+ipython==6.4.0
+moviepy==1.0.0
+pyvirtualdisplay==1.3.2
+torch==1.6.0
+opencv-python==4.4.0.42
+ipdb==0.13.3
+box2d-py
diff --git a/hw1/setup.py b/hw1/setup.py
new file mode 100644
index 00000000..3cc1886e
--- /dev/null
+++ b/hw1/setup.py
@@ -0,0 +1,8 @@
+# setup.py
+from setuptools import setup
+
+setup(
+ name='cs285',
+ version='0.1.0',
+ packages=['cs285'],
+)
\ No newline at end of file