update #4

Merged · 1 commit · Jul 9, 2023
README.md: 2 additions, 2 deletions
@@ -9,8 +9,8 @@ AgentZero is a Lightweight PyTorch Reinforcement Learning Framework.
```bash
git clone https://github.com/zhoubin-me/AgentZero
cd AgentZero
pip install -e .
pip install torch torchvision torchaudio
conda env create -f environment.yml
conda activate py3x
```

Simple run
agent0/common/atari_wrappers.py: 1 addition, 7 deletions
@@ -62,13 +62,7 @@ def make_atari(env_id, num_envs):
        lambda x: FrameStack(x, 4, False),
        RecordEpisodeStatistics,
        ClipRewardEnv,
        EpisodicLifeEnv
    ]
    if num_envs > 1:
        envs = gym.make_vec(f'{env_id}NoFrameskip-v4', num_envs, wrappers=wrappers)
    else:
        envs = gym.make(f'{env_id}NoFrameskip-v4')
        for wrapper in wrappers:
            envs = wrapper(envs)
    envs = gym.make_vec(f'{env_id}NoFrameskip-v4', num_envs, wrappers=wrappers)
    return envs

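Not part of the diff, just orientation: after this change `make_atari` always goes through gymnasium's `make_vec` and lets it apply the wrapper list to every sub-environment. A minimal usage sketch follows; it assumes the ALE ROMs are installed and the usual 84x84 Atari preprocessing sits among the wrappers truncated above, so the exact shapes are an assumption.

```python
# Hypothetical usage sketch of the rewritten make_atari; shapes assume
# 84x84 frames and the 4-frame stack from the wrapper list above.
from agent0.common.atari_wrappers import make_atari

envs = make_atari("Breakout", num_envs=4)
obs, info = envs.reset()
print(obs.shape)                      # expected: (4, 4, 84, 84) = (num_envs, stack, H, W)
obs, rew, term, trunc, info = envs.step(envs.action_space.sample())
envs.close()
```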
agent0/deepq/main.py: 10 additions, 7 deletions
@@ -1,10 +1,13 @@
from agent0.deepq.new_trainer import Trainer
from agent0.deepq.config import Config
from agent0.deepq.new_config import ExpConfig
import hydra
from hydra.core.config_store import ConfigStore

@hydra.main(version_base=None, config_name="config")
def main(cfg: ExpConfig):
    Trainer(cfg).run()

if __name__ == "__main__":
    cfg = Config()
    cfg.update()
    print(cfg)
    trainer = Trainer()
    trainer.setup(vars(cfg))
    trainer.run()
    cs = ConfigStore.instance()
    cs.store(name="config", node=ExpConfig)
    main()
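Commentary, not PR code: registering `ExpConfig` under the name "config" is what lets `@hydra.main` build the config from the dataclass defaults and accept dotted command-line overrides (for example `python -m agent0.deepq.main game=Pong trainer.batch_size=256`). A sketch of inspecting the same composition offline with Hydra's compose API; the override values are arbitrary examples.

```python
# Sketch only: compose the config without launching training.
# compose()/initialize() are Hydra's standard compose API; overrides are examples.
from hydra import compose, initialize
from hydra.core.config_store import ConfigStore
from omegaconf import OmegaConf
from agent0.deepq.new_config import ExpConfig

cs = ConfigStore.instance()
cs.store(name="config", node=ExpConfig)

with initialize(version_base=None, config_path=None):
    cfg = compose(config_name="config", overrides=["game=Pong", "trainer.batch_size=256"])

print(OmegaConf.to_yaml(cfg))   # dumps the full ExpConfig tree with the overrides applied
```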
agent0/deepq/new_agent.py: 131 additions, 0 deletions (new file)
@@ -0,0 +1,131 @@
from agent0.deepq.new_config import ExpConfig
from agent0.common.atari_wrappers import make_atari
from agent0.deepq.new_model import DeepQNet
from agent0.common.utils import DataLoaderX, DataPrefetcher
from agent0.deepq.replay import ReplayDataset

import torch
import torch.nn.functional as F
import numpy as np
from lz4.block import compress
from copy import deepcopy

class Actor:
    def __init__(self, cfg: ExpConfig):

        self.cfg = cfg
        self.envs = make_atari(self.cfg.game, self.cfg.actor.num_envs)
        self.obs, _ = self.envs.reset()

        self.action_dim = self.envs.action_space[0].n
        self.obs_shape = self.envs.observation_space.shape
        self.device = self.cfg.actor.device.value

        self.model = DeepQNet(self.action_dim, self.obs_shape[1]).to(self.device)

    def act(self, st, epsilon):
        qt = self.model(st)
        action_random = np.random.randint(0, self.action_dim, self.cfg.actor.num_envs)
        qt_max, qt_arg_max = qt.max(dim=-1)
        action_greedy = qt_arg_max.cpu().numpy()
        action = np.where(np.random.rand(self.cfg.actor.num_envs) > epsilon, action_greedy, action_random)
        return action, qt_max.mean().item()

    def sample(self, steps, epsilon, model):
        self.model = model
        rs, qs, data = [], [], []
        step = 0
        while True:
            step += 1
            with torch.no_grad():
                st = torch.from_numpy(self.obs).to(self.device).float().div(255.0)
                action, qt_max = self.act(st, epsilon)

            qs.append(qt_max)
            obs_next, reward, terminal, truncated, info = self.envs.step(action)

            done = np.logical_and(terminal, np.logical_not(truncated))
            for st, at, rt, dt, st_next in zip(self.obs, action, reward, done, obs_next):
                data.append((compress(np.concatenate((st, st_next), axis=0)), at, rt, dt))

            self.obs = obs_next

            if 'final_info' in info:
                final_infos = info['final_info'][info['_final_info']]
                for stat in final_infos:
                    rs.append(stat['episode']['r'][0])

            if step > steps:
                break

        return data, rs, qs

    def close(self):
        self.envs.close()

class Learner:
    def __init__(self, cfg: ExpConfig):
        self.cfg = cfg
        self.device = self.cfg.trainer.device.value

        dummy_env = make_atari(self.cfg.game, 1)
        self.action_dim = dummy_env.action_space[0].n
        self.obs_shape = dummy_env.observation_space.shape
        dummy_env.close()

        self.model = DeepQNet(self.action_dim, self.obs_shape[1]).to(self.device)
        self.model_target = deepcopy(self.model)
        self.optimizer = torch.optim.Adam(
            self.model.parameters(),
            self.cfg.trainer.learning_rate,
            eps=1e-2 / self.cfg.trainer.batch_size)

        self.update_steps = 0
        self.replay = ReplayDataset(cfg)
        self.batch_indices = torch.arange(self.cfg.trainer.batch_size).to(self.device)

    def get_data_fetcher(self):

        data_loader = DataLoaderX(self.replay, batch_size=self.cfg.trainer.batch_size,
                                  shuffle=True, num_workers=2, pin_memory=True)
        data_fetcher = DataPrefetcher(data_loader, self.device)
        return data_fetcher

    def train_step_dqn(self, states, next_states, actions, terminals, rewards):
        with torch.no_grad():
            q_next = self.model_target(next_states)
            a_next = q_next.argmax(dim=-1)
            q_next = q_next[self.batch_indices, a_next]
            q_target = rewards + self.cfg.algo.discount * (1 - terminals) * q_next

        q = self.model(states)[self.batch_indices, actions]
        loss = F.smooth_l1_loss(q, q_target)
        return loss

    def train_step(self, data=None):
        if data is None:
            try:
                data = self.data_fetcher.next()
            except (StopIteration, AttributeError):
                self.data_fetcher = self.get_data_fetcher()
                data = self.data_fetcher.next()

        frames, actions, rewards, terminals = data
        frames = frames.reshape(self.cfg.trainer.batch_size, -1, *self.obs_shape[2:]).float().div(255.0)
        states = frames[:, :4, :, :]
        next_states = frames[:, -4:, :, :]
        actions = actions.long()
        terminals = terminals.float()
        rewards = rewards.float()

        loss = self.train_step_dqn(states, next_states, actions, terminals, rewards)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        self.update_steps += 1

        if self.update_steps % self.cfg.trainer.target_update_freq == 0:
            self.model_target = deepcopy(self.model)

        return {'loss' : loss.item()}
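As a sanity check on the update rule (commentary, not part of the PR): `train_step_dqn` forms the one-step target r + gamma * (1 - done) * max_a Q_target(s', a) from the target network and applies a Huber (smooth L1) loss against the online network's estimate for the taken action. A toy numeric example with made-up values:

```python
# Toy numeric check of the TD target used in train_step_dqn; all values are made up.
import torch
import torch.nn.functional as F

rewards   = torch.tensor([1.0, 0.0])
terminals = torch.tensor([0.0, 1.0])                 # second transition ends its episode
q_next    = torch.tensor([[0.5, 2.0], [3.0, 1.0]])   # Q_target(s', .) for a 2-action task
gamma     = 0.99

q_target = rewards + gamma * (1 - terminals) * q_next.max(dim=-1).values
print(q_target)                                      # tensor([2.9800, 0.0000])

q = torch.tensor([2.5, 0.3])                         # online-network Q(s, a) for the taken actions
print(F.smooth_l1_loss(q, q_target))                 # Huber loss, as in the Learner
```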
agent0/deepq/new_config.py: 57 additions, 0 deletions (new file)
@@ -0,0 +1,57 @@
from dataclasses import dataclass, field
from enum import Enum
from typing import Literal

AlgoEnum = Enum('Algo', {k: i for i, k in enumerate(['dqn', 'c51'])})
ActorEnum = Enum('Actor', {k: i for i, k in enumerate(['greedy', 'random', 'eps-greedy'])})
ReplayEnum = Enum('Replay', {k: i for i, k in enumerate(['uniform', 'prior'])})
ModeEnum = Enum('Mode', {k: i for i, k in enumerate(['train', 'finetune', 'play'])})
GameEnum = Enum('Game', {k: i for i, k in enumerate(['atari', 'mujoco'])})
DeviceEnum = Enum('Device', {'cuda': 'cuda', 'cpu': 'cpu'})

@dataclass
class AlgoConfig:
    name: AlgoEnum = AlgoEnum.dqn
    discount: float = 0.99

@dataclass
class TrainerConfig:
    batch_size: int = 512
    learning_rate: float = 5e-4
    total_steps: int = int(1e7)

    training_start_steps: int = int(1e5)
    exploration_steps: int = int(1e6)
    target_update_freq: int = 500
    log_freq: int = 100
    device: DeviceEnum = DeviceEnum.cuda

    learner_steps: int = 20

@dataclass
class ActorConfig:
    policy: ActorEnum = ActorEnum.random
    num_envs: int = 16
    actor_steps: int = 80
    min_eps: float = 0.01
    test_eps: float = 0.001
    device: DeviceEnum = DeviceEnum.cuda

@dataclass
class ReplayConfig:
    size: int = int(1e6)
    policy: ReplayEnum = ReplayEnum.uniform

@dataclass
class ExpConfig:
    game: str = "Breakout"
    env: GameEnum = GameEnum.atari
    seed: int = 42
    name: str = ""
    mode: ModeEnum = ModeEnum.train
    logdir: str = "output"

    algo: AlgoConfig = field(default_factory=AlgoConfig)
    trainer: TrainerConfig = field(default_factory=TrainerConfig)
    actor: ActorConfig = field(default_factory=ActorConfig)
    replay: ReplayConfig = field(default_factory=ReplayConfig)
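The enum-valued fields matter because the agent code reads `.device.value`. A small sketch of the defaults as plain dataclasses (commentary, not PR code):

```python
# Sketch: the config tree can be instantiated directly with its defaults.
from agent0.deepq.new_config import ExpConfig, DeviceEnum

cfg = ExpConfig()
print(cfg.game, cfg.trainer.batch_size, cfg.actor.num_envs)   # Breakout 512 16
print(cfg.trainer.device is DeviceEnum.cuda)                  # True
print(cfg.trainer.device.value)                               # 'cuda', as read by Learner/Actor
```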
agent0/deepq/new_model.py: 39 additions, 0 deletions (new file)
@@ -0,0 +1,39 @@
import torch
import torch.nn as nn


def init(m, gain=1.0):
    if isinstance(m, nn.Conv2d) or isinstance(m, nn.Linear):
        nn.init.orthogonal_(m.weight.data, gain)
        nn.init.zeros_(m.bias.data)


def init_xavier(m, gain=1.0):
    if isinstance(m, nn.Linear) or isinstance(m, nn.Conv2d):
        torch.nn.init.xavier_uniform_(m.weight, gain=gain)
        if m.bias is not None:
            torch.nn.init.constant_(m.bias, 0)


class DeepQNet(nn.Module):
    def __init__(self, action_dim, chan_dim):
        super(DeepQNet, self).__init__()

        self.convs = nn.Sequential(
            nn.Conv2d(chan_dim, 32, 8, stride=4), nn.ReLU(),
            nn.Conv2d(32, 64, 4, stride=2), nn.ReLU(),
            nn.Conv2d(64, 64, 3, stride=1), nn.ReLU(), nn.Flatten())
        self.convs.apply(lambda m: init(m, nn.init.calculate_gain('relu')))


        self.first_dense = nn.Sequential(nn.Linear(64 * 7 * 7, 512), nn.ReLU())
        self.first_dense.apply(lambda m: init(m, nn.init.calculate_gain('relu')))

        self.p = nn.Linear(512, action_dim)
        self.p.apply(lambda m: init(m, 0.01))

    def forward(self, x):
        x = self.convs(x)
        x = self.first_dense(x)
        x = self.p(x)
        return x
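Commentary, not PR code: the `64 * 7 * 7` flatten size only works out for 84x84 inputs (84 -> 20 -> 9 -> 7 through the three conv layers), so the sketch below assumes the standard 84x84, 4-frame stacked Atari observation.

```python
# Quick shape check of DeepQNet under the 84x84 / 4-frame assumption.
import torch
from agent0.deepq.new_model import DeepQNet

net = DeepQNet(action_dim=4, chan_dim=4)
x = torch.rand(2, 4, 84, 84)       # batch of 2 stacked observations
print(net(x).shape)                # torch.Size([2, 4]) -> one Q-value per action
```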
agent0/deepq/new_trainer.py: 24 additions, 49 deletions
@@ -1,86 +1,61 @@
import json
import time
from abc import ABC

import numpy as np
import torch
from agent0.common.utils import LinearSchedule, set_random_seed
from agent0.deepq.actor import Actor
from agent0.deepq.agent import Agent
from agent0.deepq.config import Config
from agent0.deepq.new_agent import Learner, Actor
from agent0.deepq.new_config import ExpConfig

from tensorboardX import SummaryWriter
import logging
from tqdm import tqdm

class Trainer:
    def __init__(self):
    def __init__(self, cfg: ExpConfig):
        self.cfg = cfg
        print(cfg)

        self.Rs, self.Qs, self.TRs, self.Ls, self.ITRs, self.velocity = [], [], [], [], [], []
        self.cfg = None
        self.agent = None
        self.epsilon = None
        self.epsilon_schedule = None
        self.actors = None
        self.frame_count = None
        self.Rs, self.Qs, self.TRs, self.Ls, self.ITRs = [], [], [], [], []
        self.best = float('-inf')
        self.sample_ops = None

    def setup(self, config):
        self.cfg = Config(**config)
        self.cfg.update_atoms()
        set_random_seed(self.cfg.random_seed)
        print("input args:\n", json.dumps(vars(self.cfg), indent=4, separators=(",", ":")))

        self.agent = Agent(**config)
        self.epsilon_schedule = LinearSchedule(1.0, self.cfg.min_eps, self.cfg.exploration_steps)
        self.actor = Actor(rank=0, **config)
        set_random_seed(cfg.seed)
        self.learner = Learner(cfg)
        self.actor = Actor(cfg)
        self.epsilon_schedule = LinearSchedule(1.0, cfg.actor.min_eps, cfg.trainer.exploration_steps)

        self.frame_count = 0
        self.best = float('-inf')
        self.epsilon = 1.0
        self.writer = SummaryWriter('output')
        self.writer = SummaryWriter('output2')
        self.num_transitions = self.cfg.actor.actor_steps * self.cfg.actor.num_envs
        self.Ls, self.Rs, self.Qs = [], [], []

    def step(self):
        tic = time.time()

        transitions, rs, qs, rank, fps, best_ep = self.actor.sample(self.cfg.actor_steps, self.epsilon, self.agent.model)
        transitions, returns, qmax = self.actor.sample(self.cfg.actor.actor_steps, self.epsilon, self.learner.model)
        self.Qs.extend(qmax)
        self.Rs.extend(returns)
        # Actors
        if len(transitions) > 0:
            self.agent.replay.extend(transitions)
        if len(best_ep) > 0:
            self.agent.replay.extend_ep_best(best_ep)
            self.learner.replay.extend(transitions)

        self.epsilon = self.epsilon_schedule(self.cfg.actor_steps * self.cfg.num_envs)
        self.frame_count += self.cfg.actor_steps * self.cfg.num_envs
        self.epsilon = self.epsilon_schedule(self.num_transitions)
        self.frame_count += self.num_transitions

        self.Rs += rs
        self.Qs += qs
        # Start training at
        if len(self.agent.replay) > self.cfg.start_training_step:
            data = [self.agent.train_step() for _ in range(self.cfg.agent_train_steps)]
        if len(self.learner.replay) > self.cfg.trainer.training_start_steps:
            data = [self.learner.train_step() for _ in range(self.cfg.trainer.learner_steps)]
            loss = [x['loss'] for x in data]
            loss = torch.stack(loss)
            self.Ls += loss.tolist()
            self.Ls.extend(loss)

        toc = time.time()
        self.velocity.append(self.cfg.actor_steps * self.cfg.num_envs / (toc - tic))

        result = dict(
            epsilon=self.epsilon,
            frames=self.frame_count,
            velocity=np.mean(self.velocity[-20:]) if len(self.velocity) > 0 else None,
            velocity=self.num_transitions / (toc - tic),
            loss=np.mean(self.Ls[-20:]) if len(self.Ls) > 0 else None,
            return_test=np.mean(self.ITRs) if len(self.ITRs) > 0 else None,
            return_train=np.mean(self.Rs[-20:]) if len(self.Rs) > 0 else None,
            return_train_max=np.max(self.Rs) if len(self.Rs) > 0 else None,
            return_test_max=np.max(self.TRs) if len(self.TRs) > 0 else None,
            qmax=np.mean(self.Qs[-100:]) if len(self.Qs) > 0 else None
        )
        return result

    def run(self):
        trainer_steps = self.cfg.total_steps // (self.cfg.num_envs * self.cfg.actor_steps) + 1
        trainer_steps = self.cfg.trainer.total_steps // self.num_transitions + 1
        with tqdm(range(trainer_steps)) as t:
            for _ in t:
                result = self.step()
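Commentary, not PR code: the loop sizing in `run()` follows directly from the config defaults. Each trainer step collects `actor_steps * num_envs` transitions, and `total_steps` is divided by that amount. A back-of-the-envelope sketch with the default values from new_config.py:

```python
# Back-of-the-envelope with the defaults from new_config.py; not PR code.
actor_steps, num_envs, total_steps = 80, 16, int(1e7)

num_transitions = actor_steps * num_envs              # 1280 frames collected per trainer step
trainer_steps = total_steps // num_transitions + 1    # 7813 outer iterations in run()
print(num_transitions, trainer_steps)
```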