From 69fc4d0b1f6786c17a39fd1ec10e9f330740eb0b Mon Sep 17 00:00:00 2001
From: Lmh-sys <493908689@qq.com>
Date: Wed, 1 Nov 2023 09:15:57 +0800
Subject: [PATCH] Adapt to new gym=0.25.2 and pettingzoo=1.24.1
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .gitignore  |  2 ++
 Agent.py    |  6 +++---
 MADDPG.py   |  7 ++++---
 evaluate.py | 15 ++++++++-------
 main.py     | 25 +++++++++++++------------
 5 files changed, 30 insertions(+), 25 deletions(-)

diff --git a/.gitignore b/.gitignore
index f893826..ebf01cb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,5 @@
 .idea/
 /results/
 /.venv/
+.vscode/
+__pycache__
\ No newline at end of file
diff --git a/Agent.py b/Agent.py
index c113d82..9097da3 100644
--- a/Agent.py
+++ b/Agent.py
@@ -10,12 +10,12 @@
 class Agent:
     """Agent that can interact with environment from pettingzoo"""
 
-    def __init__(self, obs_dim, act_dim, global_obs_dim, actor_lr, critic_lr):
-        self.actor = MLPNetwork(obs_dim, act_dim)
+    def __init__(self, obs_dim, act_dim, global_obs_dim, actor_lr, critic_lr, device):
+        self.actor = MLPNetwork(obs_dim, act_dim).to(device)
         # critic input all the observations and actions
         # if there are 3 agents for example, the input for critic is (obs1, obs2, obs3, act1, act2, act3)
-        self.critic = MLPNetwork(global_obs_dim, 1)
+        self.critic = MLPNetwork(global_obs_dim, 1).to(device)
         self.actor_optimizer = Adam(self.actor.parameters(), lr=actor_lr)
         self.critic_optimizer = Adam(self.critic.parameters(), lr=critic_lr)
         self.target_actor = deepcopy(self.actor)
diff --git a/MADDPG.py b/MADDPG.py
index 2b76659..12760c4 100644
--- a/MADDPG.py
+++ b/MADDPG.py
@@ -27,6 +27,7 @@ def setup_logger(filename):
 
 class MADDPG:
     """A MADDPG(Multi Agent Deep Deterministic Policy Gradient) agent"""
+    device = 'cuda'
 
     def __init__(self, dim_info, capacity, batch_size, actor_lr, critic_lr, res_dir):
         # sum all the dims of each agent to get input dim for critic
@@ -35,8 +36,8 @@ def __init__(self, dim_info, capacity, batch_size, actor_lr, critic_lr, res_dir)
         self.agents = {}
         self.buffers = {}
         for agent_id, (obs_dim, act_dim) in dim_info.items():
-            self.agents[agent_id] = Agent(obs_dim, act_dim, global_obs_act_dim, actor_lr, critic_lr)
-            self.buffers[agent_id] = Buffer(capacity, obs_dim, act_dim, 'cpu')
+            self.agents[agent_id] = Agent(obs_dim, act_dim, global_obs_act_dim, actor_lr, critic_lr, self.device)
+            self.buffers[agent_id] = Buffer(capacity, obs_dim, act_dim, self.device)
 
         self.dim_info = dim_info
         self.batch_size = batch_size
@@ -81,7 +82,7 @@ def sample(self, batch_size):
     def select_action(self, obs):
         actions = {}
         for agent, o in obs.items():
-            o = torch.from_numpy(o).unsqueeze(0).float()
+            o = torch.from_numpy(o).unsqueeze(0).float().to(self.device)
             a = self.agents[agent].action(o)  # torch.Size([1, action_size])
             # NOTE that the output is a tensor, convert it to int before input to the environment
             actions[agent] = a.squeeze(0).argmax().item()
diff --git a/evaluate.py b/evaluate.py
index e4e7dd6..8c466c0 100644
--- a/evaluate.py
+++ b/evaluate.py
@@ -10,8 +10,8 @@
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
-    parser.add_argument('env_name', type=str, default='simple_adversary_v2', help='name of the env',
-                        choices=['simple_adversary_v2', 'simple_spread_v2', 'simple_tag_v2'])
+    parser.add_argument('env_name', type=str, default='simple_spread_v3', help='name of the env',
+                        choices=['simple_adversary_v3', 'simple_spread_v3', 'simple_tag_v3'])
     parser.add_argument('folder', type=str, help='name of the folder where model is saved')
     parser.add_argument('--episode-num', type=int, default=10, help='total episode num during evaluation')
     parser.add_argument('--episode-length', type=int, default=50, help='steps per episode')
@@ -32,19 +32,20 @@
     # reward of each episode of each agent
     episode_rewards = {agent: np.zeros(args.episode_num) for agent in env.agents}
     for episode in range(args.episode_num):
-        states = env.reset()
+        states, _ = env.reset()
         agent_reward = {agent: 0 for agent in env.agents}  # agent reward of the current episode
         frame_list = []  # used to save gif
         while env.agents:  # interact with the env for an episode
             actions = maddpg.select_action(states)
-            next_states, rewards, dones, infos = env.step(actions)
-            frame_list.append(Image.fromarray(env.render(mode='rgb_array')))
+            next_states, rewards, terminated, truncated, info = env.step(actions)
+            dones = terminated or truncated
+            frame_list.append(Image.fromarray(env.render()))
             states = next_states
 
             for agent_id, reward in rewards.items():  # update reward
                 agent_reward[agent_id] += reward
 
-        env.close()
+        # env.close()
         message = f'episode {episode + 1}, '
         # episode finishes, record reward
         for agent_id, reward in agent_reward.items():
@@ -54,7 +55,7 @@
         # save gif
         frame_list[0].save(os.path.join(gif_dir, f'out{gif_num + episode + 1}.gif'),
                            save_all=True, append_images=frame_list[1:], duration=1, loop=0)
-
+    env.close()
     # training finishes, plot reward
     fig, ax = plt.subplots()
     x = range(1, args.episode_num + 1)
diff --git a/main.py b/main.py
index 4aecb98..2337f4e 100644
--- a/main.py
+++ b/main.py
@@ -3,7 +3,7 @@
 
 import matplotlib.pyplot as plt
 import numpy as np
-from pettingzoo.mpe import simple_adversary_v2, simple_spread_v2, simple_tag_v2
+from pettingzoo.mpe import simple_adversary_v3, simple_spread_v3, simple_tag_v3
 
 from MADDPG import MADDPG
 
@@ -11,12 +11,12 @@ def get_env(env_name, ep_len=25):
     """create environment and get observation and action dimension of each agent in this environment"""
     new_env = None
-    if env_name == 'simple_adversary_v2':
-        new_env = simple_adversary_v2.parallel_env(max_cycles=ep_len)
-    if env_name == 'simple_spread_v2':
-        new_env = simple_spread_v2.parallel_env(max_cycles=ep_len)
-    if env_name == 'simple_tag_v2':
-        new_env = simple_tag_v2.parallel_env(max_cycles=ep_len)
+    if env_name == 'simple_adversary_v3':
+        new_env = simple_adversary_v3.parallel_env(max_cycles=ep_len)
+    if env_name == 'simple_spread_v3':
+        new_env = simple_spread_v3.parallel_env(max_cycles=ep_len,render_mode="rgb_array")
+    if env_name == 'simple_tag_v3':
+        new_env = simple_tag_v3.parallel_env(max_cycles=ep_len)
 
     new_env.reset()
     _dim_info = {}
@@ -30,8 +30,8 @@ def get_env(env_name, ep_len=25):
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
-    parser.add_argument('env_name', type=str, default='simple_adversary_v2', help='name of the env',
-                        choices=['simple_adversary_v2', 'simple_spread_v2', 'simple_tag_v2'])
+    parser.add_argument('env_name', type=str, default='simple_adversary_v3', help='name of the env',
+                        choices=['simple_adversary_v3', 'simple_spread_v3', 'simple_tag_v3'])
     parser.add_argument('--episode_num', type=int, default=30000,
                         help='total episode num during training procedure')
     parser.add_argument('--episode_length', type=int, default=25, help='steps per episode')
@@ -64,7 +64,7 @@ def get_env(env_name, ep_len=25):
     # reward of each episode of each agent
    episode_rewards = {agent_id: np.zeros(args.episode_num) for agent_id in env.agents}
     for episode in range(args.episode_num):
-        obs = env.reset()
+        obs, _ = env.reset()
         agent_reward = {agent_id: 0 for agent_id in env.agents}  # agent reward of the current episode
         while env.agents:  # interact with the env for an episode
             step += 1
@@ -72,8 +72,9 @@ def get_env(env_name, ep_len=25):
                 action = {agent_id: env.action_space(agent_id).sample() for agent_id in env.agents}
             else:
                 action = maddpg.select_action(obs)
-
-            next_obs, reward, done, info = env.step(action)
+
+            next_obs, reward, terminated, truncated, info = env.step(action)
+            done = terminated or truncated
             # env.render()
 
             maddpg.add(obs, action, reward, next_obs, done)
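
Note on the API this patch targets: in pettingzoo 1.24 the parallel environments return (observations, infos) from reset(), return five per-agent dicts (observations, rewards, terminations, truncations, infos) from step(), and render() no longer accepts a mode argument (the mode is fixed by render_mode at construction). Because terminated and truncated are dicts in the parallel API, the expression "terminated or truncated" evaluates to whichever dict is truthy (normally just terminated) instead of combining the two flags for each agent; a per-agent merge is one alternative. The snippet below is a minimal sketch of the adapted interaction loop, assuming pettingzoo==1.24.1 with simple_spread_v3 available; it uses random actions in place of maddpg.select_action(obs) and is not part of the patch itself.

    # sketch only: assumes pettingzoo==1.24.1 with the mpe environments installed
    from pettingzoo.mpe import simple_spread_v3

    env = simple_spread_v3.parallel_env(max_cycles=25, render_mode="rgb_array")
    obs, infos = env.reset()  # reset() now returns (observations, infos)

    while env.agents:
        # random actions stand in for maddpg.select_action(obs) in this sketch
        actions = {agent: env.action_space(agent).sample() for agent in env.agents}
        next_obs, rewards, terminated, truncated, infos = env.step(actions)
        # combine the two termination signals agent by agent,
        # instead of the dict-level "terminated or truncated"
        dones = {agent: terminated[agent] or truncated[agent] for agent in terminated}
        obs = next_obs

    env.close()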