eval.py
# Copyright (c) 2023 Contextual AI, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
"""
Main script for running evals. This will run an eval according to the specified config, which should be a YAML file generated during training.
You must override the mode from 'train' to one of 'sample', 'eval', or 'alpacaeval'.
Overriding the other config parameters is optional.
For sampling, do something like:
python eval.py --config-path=/data/models/archangel/archangel_sft_pythia1-4b ++mode=sample ++n_samples=512 ++model.eval_batch_size=32
For calculating the batch metrics (e.g., accuracy of predicted preference direction when preference is inferred from DPO rewards) on a held-out set:
python eval.py --config-path=/data/models/archangel/archangel_sft_pythia1-4b ++mode=eval
To sample from the unaligned model (e.g., the original EleutherAI/pythia1-4b), add ++saved_policy=null to the command.
To sample from every prompt (without limit), set ++n_samples=null
"""
import torch
torch.backends.cuda.matmul.allow_tf32 = True
import transformers
from transformers import set_seed
from utils import disable_dropout, init_distributed, get_open_port, rank0_print, get_batch_logps
import os
import hydra
import torch.multiprocessing as mp
from omegaconf import OmegaConf, DictConfig
import json
import socket
from typing import Optional, Set
from trainers import BasicTrainer, DPOTrainer, PairedPreferenceTrainer
import dataloader
from datetime import datetime
from collections import defaultdict
import pandas as pd
@hydra.main(version_base=None,
            config_path="",
            config_name="config")
def main(config: DictConfig):
    """Main entry point for evaluation. Validates the config, loads the model(s) and tokenizer, then runs the requested sampling/eval mode."""
    # Resolve hydra references, e.g. so we don't re-compute the run directory
    OmegaConf.resolve(config)
    print(OmegaConf.to_yaml(config))

    if config.mode not in ['sample', 'eval', 'alpacaeval', 'sample_ufb', 'sample_shp', 'margin_probs']:
        raise Exception("This is a script for eval/sampling. config.mode should be one of 'sample', 'eval', 'alpacaeval', 'sample_ufb', 'sample_shp', or 'margin_probs'")

    set_seed(config.seed)

    print('=' * 80)
    print(f'Writing to {config.samples_dir}')
    print('=' * 80)
    # purely inference; 'balanced_low_0' spreads the model across GPUs while keeping GPU 0 relatively free (useful for generation)
    model_kwargs = {'device_map': "balanced_low_0"}

    tokenizer_name_or_path = config.local_run_dir or config.model.tokenizer_name_or_path  # first see if saved tokenizer is in the experiment directory
    print(f'Loading tokenizer at {tokenizer_name_or_path}')
    tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer_name_or_path)
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token_id = tokenizer.eos_token_id

    print('building policy')
    policy_dtype = getattr(torch, config.model.policy_dtype)
    policy = transformers.AutoModelForCausalLM.from_pretrained(
        config.model.name_or_path, low_cpu_mem_usage=True, use_flash_attention_2=config.model.use_flash_attention, torch_dtype=policy_dtype, **model_kwargs)
    # note that models were only resized for csft before saving
    # important because the number of tokens in the pretrained tokenizer is different from model.config.vocab_size,
    # so resizing at eval will throw an error if not resized before training
    if config.loss.name == 'csft':
        policy.resize_token_embeddings(len(tokenizer))  # model being loaded should already be trained with additional tokens for this to be valid

    disable_dropout(policy)

    # saved policy can be force set to null to sample from the pretrained model
    if config.saved_policy is not None:
        state_dict = torch.load(os.path.join(config.saved_policy), map_location='cpu')
        step, metrics = state_dict['step_idx'], state_dict['metrics']
        print(f'loading pre-trained weights for policy at step {step} from {config.saved_policy} with metrics {json.dumps(metrics, indent=2)}')
        policy.load_state_dict(state_dict['state'])
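
    # the reference model is only needed to compute implicit rewards (for 'eval' and 'margin_probs');
    # for pure sampling it is left as None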
    if config.mode in ['eval', 'margin_probs'] and config.loss.use_reference_model:
        print('building reference model')
        reference_model_dtype = getattr(torch, config.model.reference_dtype)
        reference_model = transformers.AutoModelForCausalLM.from_pretrained(
            config.model.name_or_path, low_cpu_mem_usage=True, use_flash_attention_2=config.model.use_flash_attention, torch_dtype=reference_model_dtype, **model_kwargs)

        if config.loss.name == 'ppo':
            reference_model.resize_token_embeddings(len(tokenizer))

        disable_dropout(reference_model)

        if config.model.load_from is not None:
            state_dict = torch.load(os.path.join(config.cache_dir, config.model.load_from), map_location='cpu')
            step, metrics = state_dict['step_idx'], state_dict['metrics']
            print(f'loading pre-trained weights for reference at step {step} from {config.model.load_from} with metrics {json.dumps(metrics, indent=2)}')
            reference_model.load_state_dict(state_dict['state'])
    else:
        reference_model = None
    data_loader_class = getattr(dataloader, config.loss.dataloader)
    data_iterator_kwargs = dict(
        max_length=config.model.max_length,
        max_prompt_length=config.model.max_prompt_length,
        # since the human/asst fields are not in the configs of the already-released models, add defaults
        human_prefix=config['human_prefix'],
        human_suffix=config['human_suffix'],
        assistant_prefix=config['assistant_prefix'],
        assistant_suffix=config['assistant_suffix'],
        seed=config.seed,
        # the following kwargs can be used to make the dataset imbalanced (only used by UnbalancedUnpairedPreferenceDataLoader)
        frac_unique_desirable=config.get('frac_unique_desirable', 1.0),
        frac_unique_undesirable=config.get('frac_unique_undesirable', 1.0),
        # control tokens taken from Korbak et al.'s (2023) "Pretraining Language Models with Human Preferences"
        # SFTDataLoader will use them for sampling; ConditionalSFTDataLoader uses them for training
        chosen_control_token=(config.loss.chosen_control_token if config.loss.name == "csft" else None),
        rejected_control_token=(config.loss.rejected_control_token if config.loss.name == "csft" else None),
    )
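
    # dispatch on mode: the sampling modes write a JSON file to samples_dir, 'eval' prints the trainer's
    # metrics, and 'margin_probs' writes a CSV of per-example reward and score margins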
    if config.mode == 'sample':
        print(f'Loading dataloader')
        os.makedirs(config.samples_dir, exist_ok=True)

        # use the SFT dataloader because we don't want to repeat prompts,
        # and because data ordering is different in paired vs unpaired data loaders;
        # this way, sampled prompts are the same for a given seed
        eval_iterator = dataloader.SFTDataLoader(
            config.datasets,
            tokenizer,
            split='test',
            batch_size=config.model.eval_batch_size,
            n_examples=config.n_samples,
            max_prompt_count=1,
            **data_iterator_kwargs
        )

        trainer = BasicTrainer(tokenizer, config, None, eval_iterator, policy, reference_model=reference_model)
        samples = trainer.sample()

        fn = os.path.join(config.samples_dir, f'{config.exp_name}.json')
        json.dump({
            'sampled_at': str(datetime.now()),
            'config': OmegaConf.to_container(config, resolve=True),
            'samples': samples,
        }, open(fn, 'w'), indent=2)
    elif config.mode == 'alpacaeval':
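        # sample responses to the AlpacaEval prompt set and write them in the format the AlpacaEval
        # harness expects (instruction / output / generator, plus dataset and datasplit metadata)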
        print(f'Loading dataloader')
        os.makedirs(config.samples_dir, exist_ok=True)

        eval_iterator = dataloader.SFTDataLoader(
            ['alpacaeval'],
            tokenizer,
            split='test',
            batch_size=config.model.eval_batch_size,
            n_examples=None,
            n_epochs=1,
            **data_iterator_kwargs
        )

        trainer = BasicTrainer(tokenizer, config, None, eval_iterator, policy, reference_model=reference_model)
        samples = trainer.sample(include_original_prompt=True)
        alpaca_formatted_examples = []

        for sample in samples:
            alpaca_formatted_examples.append({
                'instruction': sample['original_prompt'],
                'output': sample['policy'].strip(),
                # 'reference': sample['chosen'].strip(),
                'generator': config.exp_name,
                'dataset': 'helpful_base',
                'datasplit': 'eval'
            })

        fn = os.path.join(config.samples_dir, f'alpaca_{config.exp_name}.json')
        json.dump(alpaca_formatted_examples, open(fn, 'w'), indent=2)
    elif config.mode == 'eval':
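        # run the trainer's eval loop to compute batch metrics (e.g., accuracy of the preference direction
        # implied by the implicit rewards) on the held-out split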
        print(f'Loading dataloader')
        eval_iterator = data_loader_class(
            config.datasets,
            tokenizer,
            split='test',
            batch_size=config.model.eval_batch_size,
            n_examples=config.n_eval_examples,
            n_epochs=(1 if config.n_eval_examples is None else None),
            **data_iterator_kwargs
        )

        trainer = DPOTrainer(tokenizer, config, None, eval_iterator, policy, reference_model=reference_model)
        results = trainer.eval()
        rank0_print(results)
    elif config.mode.startswith('sample_'):
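        # 'sample_ufb' / 'sample_shp': like the alpacaeval branch, but sample from the configured datasets
        # and also record the dataset's top-scoring chosen response for each prompt ('top_score_chosen')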
        print(f'Loading dataloader')
        os.makedirs(config.samples_dir, exist_ok=True)

        # use the SFT dataloader because we don't want to repeat prompts,
        # and because data ordering is different in paired vs unpaired data loaders;
        # this way, sampled prompts are the same for a given seed
        eval_iterator = dataloader.SFTDataLoader(
            config.datasets,
            tokenizer,
            split='test',
            batch_size=config.model.eval_batch_size,
            n_examples=config.n_samples,
            max_prompt_count=1,
            alpacaeval=True,
            **data_iterator_kwargs
        )

        trainer = BasicTrainer(tokenizer, config, None, eval_iterator, policy, reference_model=reference_model)
        samples = trainer.sample(include_original_prompt=True)
        alpaca_formatted_examples = []

        for sample in samples:
            prompt = sample['prompt'].replace(config.human_prefix, '')
            prompt = prompt.replace(config.assistant_prefix, '')
            alpaca_formatted_examples.append({
                'instruction': prompt.strip(),
                'output': sample['policy'].strip(),
                'generator': config.exp_name,
                'dataset': 'helpful_base',
                'datasplit': 'eval',
                'top_score_chosen': sample['best_chosen']
            })

        fn = os.path.join(config.samples_dir, f'{config.exp_name}.json')
        json.dump(alpaca_formatted_examples, open(fn, 'w'), indent=2)
    elif config.mode == 'margin_probs':
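        # for each preference pair, score the chosen and rejected responses under both the policy and the
        # reference model, then compare the implicit reward margin (policy logp - reference logp) against
        # the margin between the dataset's annotated chosen/rejected scores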
        os.makedirs(config.samples_dir, exist_ok=True)

        eval_iterator = dataloader.PairedPreferenceDataLoader(
            config.datasets,
            tokenizer,
            split='test',
            batch_size=config.model.eval_batch_size,
            n_examples=config.n_samples,
            max_prompt_count=1,
            alpacaeval=True,
            **data_iterator_kwargs
        )

        # trainer = PairedPreferenceTrainer(tokenizer, config, None, eval_iterator, policy, reference_model=reference_model)
        policy.eval()
        reference_model.eval()

        eval_dict = defaultdict(list)

        for eval_batch in list(eval_iterator):
            for k in ['prompt_text', 'chosen_text', 'rejected_text', 'chosen_scores', 'rejected_scores']:
                eval_dict[k].extend(eval_batch[k])

            # policy_chosen_logps, policy_rejected_logps = trainer.forward(policy, eval_batch)
            is_mistral = 'mistral' in config.model.name_or_path.lower()

            for label in ['chosen', 'rejected']:
                for model_name, model in zip(['policy', 'ref'], [policy, reference_model]):
                    all_logits = model(eval_batch[f'{label}_combined_input_ids'], attention_mask=eval_batch[f'{label}_combined_attention_mask'], use_cache=(not is_mistral)).logits
                    all_logps = get_batch_logps(all_logits, eval_batch[f'{label}_labels'], average_log_prob=False)
                    eval_dict[f"{model_name}_{label}_logps"].extend(all_logps.tolist())
        eval_dict['chosen_rewards'] = (
            torch.tensor(eval_dict['policy_chosen_logps']) - torch.tensor(eval_dict['ref_chosen_logps'])
        ).tolist()
        eval_dict['rejected_rewards'] = (
            torch.tensor(eval_dict['policy_rejected_logps']) - torch.tensor(eval_dict['ref_rejected_logps'])
        ).tolist()
        eval_dict['reward_margins'] = (
            torch.tensor(eval_dict['chosen_rewards']) - torch.tensor(eval_dict['rejected_rewards'])
        ).tolist()
        eval_dict['score_margins'] = (
            torch.tensor(eval_dict['chosen_scores']) - torch.tensor(eval_dict['rejected_scores'])
        ).tolist()

        df = pd.DataFrame(eval_dict)
        df.to_csv(config.samples_dir + f'/{config.datasets[0]}_margin_probs.csv', index=False)
    else:
        raise Exception(f"unrecognized mode: {config.mode}")

if __name__ == '__main__':
main()