"""
The `simulation.py` module is part of a larger project aimed at exploring and analyzing the performance
of various strategies (agents) in solving multi-armed bandit problems. This module provides the foundational
classes necessary for setting up and running simulations, allowing for an empirical comparison of different
agents' abilities to maximize rewards in both stationary and non-stationary bandit environments.
Classes:
Experiment: Facilitates the execution of a single experiment, tracking the interactions between
an agent and a bandit problem over a series of steps. It captures key metrics such
as average rewards, optimal action selections, and cumulative rewards, enabling a detailed
analysis of an agent's performance.
Simulations: Manages the execution of multiple experiments across different agents and potentially
varying bandit problem configurations. This class automates the process of running
numerous simulations, collecting results, and providing tools for visualizing comparative
performance metrics such as average reward, optimal action frequency, and various forms
of regret analysis.
This module is designed to be flexible, supporting experiments with a wide range of bandit problem types
and agent strategies. By providing a structured approach to experimentation and simulation, it serves as a
valuable tool for researchers and practitioners in the field of reinforcement learning, particularly those
focused on the exploration-exploitation trade-off inherent in multi-armed bandit problems.
Usage:
The classes within this module can be used as part of a larger framework for conducting empirical
research on multi-armed bandit algorithms. Users can define their own agents and bandit problems,
then utilize the `Experiment` and `Simulations` classes to evaluate and compare their performance
under controlled conditions.
Example:
See the documentation for `Experiment` and `Simulations` classes for example usage.
"""
import numpy as np
import matplotlib.pyplot as plt
class Experiment:
"""
A class to conduct experiments with multi-armed bandit problems.
This class encapsulates the setup required to run simulations where an agent interacts with a bandit environment.
It tracks the actions taken by the agent, the rewards received, and how often the agent selects the optimal action.
This facilitates analysis of the agent's performance and learning behavior over time.
Attributes:
bandit (Bandit): An instance of a Bandit class, representing the multi-armed bandit problem.
agent (Agent): An instance of an Agent class, representing the strategy or algorithm used to select actions.
steps (int): The number of steps (iterations) the experiment will run.
optimal_actions (np.ndarray): An array tracking whether the agent selected the optimal action at each step.
optimal_values (np.ndarray): An array storing the optimal value (best possible reward) at each step.
reward_hist (np.ndarray): An array recording the reward received by the agent at each step.
Args:
bandit (Bandit): The bandit problem instance the agent will interact with.
agent (Agent): The agent that will be making decisions on which arm to pull.
steps (int): The total number of steps the experiment will execute.
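Example:
    An illustrative sketch only; `SomeBandit` and `SomeAgent` are placeholder
    names for concrete bandit and agent classes defined elsewhere in the project:
        bandit = SomeBandit(num_arms=10)
        agent = SomeAgent(num_arms=10)
        experiment = Experiment(bandit=bandit, agent=agent, steps=1000)
        experiment.run()
        total_reward = experiment.reward_hist.sum()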
"""
def __init__(self, bandit, agent, steps):
"""
Initializes the Experiment with a bandit, an agent, and a specified number of steps.
"""
self.bandit = bandit
self.agent = agent
self.steps = steps
self.optimal_actions = np.empty(steps)
self.optimal_values = np.empty(steps)
self.reward_hist = np.empty(steps)
def run(self):
"""
Executes the experiment over the specified number of steps.
At each step, the agent selects an action (arm to pull) based on its strategy,
receives a reward from the bandit, and updates its knowledge based on the outcome.
The experiment tracks the reward received, whether the chosen action was optimal,
and the optimal value at each step.
"""
for step in range(self.steps):
action = self.agent.choose_action()
reward = self.bandit.pull(action)
self.agent.update(action=action, reward=reward)
# Record whether the selected action was the optimal one
self.optimal_actions[step] = int(action == self.bandit.best_arm)
# Record the reward received
self.reward_hist[step] = reward
# Record the optimal value for reference
self.optimal_values[step] = self.bandit.best_value
@property
def name(self):
"""
A property to get the name of the agent being used in the experiment.
Returns:
str: The name of the agent.
"""
return self.agent.name
class Simulations:
"""
A class for conducting multiple simulation runs of bandit problems with various agents.
This class orchestrates the setup, execution, and visualization of experiments comparing
different strategies (agents) in interacting with a specified bandit environment over a series
of runs and steps.
Attributes:
runs (int): The number of simulation runs for each agent.
steps (int): The number of steps (actions taken) in each run.
num_arms (int): The number of arms in the bandit problem.
bandit (Bandit): The class of the bandit problem to be used in the simulations.
bandit_args (dict): Arguments required to initialize instances of the bandit class.
agents_lst (list): A list of dictionaries, each specifying an agent class and its arguments.
experiments_lst (list): A list of Experiment instances, each corresponding to an agent.
results_dct (dict): A dictionary storing the results of the simulations for each agent.
results_template (NoneType): Placeholder for future use.
Args:
runs (int): Number of runs to simulate.
steps (int): Number of steps per run.
arms (int): Number of arms in the bandit problem.
bandit (Bandit): The bandit class to be used in the experiments.
bandit_args (dict): Initialization arguments for the bandit class.
agents (list): List of dictionaries specifying agents and their initialization arguments.
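Example:
    An illustrative sketch only; `SomeBandit`, `GreedyAgent`, and `EpsilonGreedyAgent`
    are placeholder names for classes defined elsewhere in the project, and the
    `epsilon` argument is likewise assumed:
        agents = [{'agent': GreedyAgent, 'args': {}},
                  {'agent': EpsilonGreedyAgent, 'args': {'epsilon': 0.1}}]
        sims = Simulations(runs=200, steps=1000, arms=10, bandit=SomeBandit, bandit_args={}, agents=agents)
        sims.run()
        sims.visualise(window=10)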
"""
def __init__(self, runs, steps, arms, bandit, bandit_args, agents):
self.runs = runs
self.steps = steps
self.num_arms = arms
self.bandit = bandit
self.bandit_args = bandit_args
self.agents_lst = agents
self.experiments_lst = []
self.results_dct = {}
self.results_template = None
self.bandit_name = None
self.build_experiments()
def build_experiments(self):
"""
Constructs Experiment instances for each specified agent and bandit configuration.
"""
for agent_dct in self.agents_lst:
bandit_instance = self.bandit(num_arms=self.num_arms, **self.bandit_args)
agent_instance = agent_dct['agent'](self.num_arms, **agent_dct['args'])
exp = Experiment(agent=agent_instance, bandit=bandit_instance, steps=self.steps)
self.experiments_lst.append(exp)
self.results_dct[exp.name] = {
'optimal_action_count': np.zeros((self.runs, self.steps)),
'avg_rewards': np.zeros((self.runs, self.steps)),
'optimal_value': np.zeros((self.runs, self.steps)),
}
self.bandit_name = bandit_instance.name
def run(self):
"""
Executes each experiment across the specified number of runs, collecting performance metrics.
"""
for exp in self.experiments_lst:
for r in range(self.runs):
exp.run()
self.results_dct[exp.name]['avg_rewards'][r] = exp.reward_hist
self.results_dct[exp.name]['optimal_action_count'][r] = exp.optimal_actions
self.results_dct[exp.name]['optimal_value'][r] = exp.optimal_values
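# Reset the agent's state and the bandit so the next run starts from a clean slate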
exp.agent.reset()
exp.bandit.reset()
def visualise(self, window=10):
"""
Visualizes the results of the simulations, including average rewards, optimal action counts,
cumulative rewards, reward variability, cumulative regret, and instantaneous regret.
Args:
window (int): The size of the moving average window for smoothing variability and instantaneous regret plots.
"""
# Plot setup
fig, axes = plt.subplots(6, 1, figsize=(10, 24))
steps = np.arange(0, self.steps)
linewidth = 1
conv_kernel = np.ones(window)/window
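# Uniform moving-average kernel; the 'valid' convolutions below shorten each
# smoothed series by (window - 1) samples, hence the matching x-axis slices.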
for name, dct in self.results_dct.items():
axes[0].plot(steps, dct['avg_rewards'].mean(0), label=name, linewidth=linewidth)
axes[1].plot(steps, dct['optimal_action_count'].mean(0) * 100, label=name, linewidth=linewidth)  # scale to % to match the axis label
axes[2].plot(steps, np.cumsum(dct['avg_rewards'].mean(0), axis=0), label=name, linewidth=linewidth)
reward_variability = np.std(dct['avg_rewards'], axis=0)
reward_variability = np.convolve(reward_variability, conv_kernel, mode='valid')
axes[3].plot(steps[:len(reward_variability)], reward_variability[:self.steps], label=name, linewidth=linewidth)
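# Cumulative regret: cumulative reward of always pulling the best arm minus the
# cumulative reward actually obtained (both averaged over runs).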
optimal_reward_cum = np.cumsum(dct['optimal_value'].mean(0), axis=0)
actual_rewards_cum = np.cumsum(dct['avg_rewards'].mean(0), axis=0)
regret_cum = optimal_reward_cum - actual_rewards_cum
axes[4].plot(steps, regret_cum, label=name, linewidth=linewidth)
regret_inst = np.convolve(np.diff(regret_cum), conv_kernel, mode='valid')
axes[5].plot(steps[:len(regret_inst)], regret_inst[:self.steps], label=name, linewidth=linewidth)
# Plot average rewards
axes[0].set_title('Average reward')
axes[0].set_xlabel('Steps')
axes[0].set_ylabel('Average reward')
# Plot percentage of optimal actions
axes[1].set_title('Percentage of optimal action')
axes[1].set_xlabel('Steps')
axes[1].set_ylabel('% Optimal action')
axes[2].set_title('Cumulative reward')
axes[2].set_xlabel('Steps')
axes[2].set_ylabel('Reward')
# Plot reward variability
axes[3].set_title(f'Reward Variability (window={window})')
axes[3].set_xlabel('Steps')
axes[3].set_ylabel('Stddev')
axes[4].set_title('Cumulative Regret')
axes[4].set_xlabel('Steps')
axes[4].set_ylabel('Cumulative Regret')
axes[5].set_title(f'Instantaneous Regret (window={window})')
axes[5].set_xlabel('Steps')
axes[5].set_ylabel('Instantaneous Regret')
for ax in axes:
ax.legend()
ax.grid()
fig.suptitle(f'{self.bandit_name}', fontsize=16)
plt.tight_layout()
fig.subplots_adjust(top=.96)
plt.show()
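# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only, not part of the original design).
# The tiny bandit and agent below are placeholders that merely satisfy the
# duck-typed interface this module relies on: a bandit exposing `pull`,
# `reset`, `best_arm`, `best_value`, and `name`; an agent exposing
# `choose_action`, `update`, `reset`, and `name`.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    class _SketchBandit:
        """Stationary Gaussian bandit stub with unit-variance rewards."""
        name = 'Sketch Gaussian bandit'
        def __init__(self, num_arms):
            self.num_arms = num_arms
            self.reset()
        def reset(self):
            # Re-draw the arm means so each run uses a fresh problem instance
            self.means = np.random.normal(size=self.num_arms)
            self.best_arm = int(np.argmax(self.means))
            self.best_value = float(np.max(self.means))
        def pull(self, action):
            return float(np.random.normal(self.means[action], 1.0))

    class _SketchAgent:
        """Uniformly random agent stub (no learning)."""
        name = 'Sketch random agent'
        def __init__(self, num_arms):
            self.num_arms = num_arms
        def choose_action(self):
            return int(np.random.randint(self.num_arms))
        def update(self, action, reward):
            pass
        def reset(self):
            pass

    sims = Simulations(runs=20, steps=500, arms=10, bandit=_SketchBandit,
                       bandit_args={}, agents=[{'agent': _SketchAgent, 'args': {}}])
    sims.run()
    sims.visualise(window=10)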