import numpy as np
from vllm import LLM, SamplingParams
### templates
template_w_ref = "Please act as a precise judge and evaluate the quality of the answer to question. Rate the answer from 0 to 9, where a higher value means a better answer. Please refer to the reference answer to make your judgement. Respond with an integer between 0 and 9. \n\n[Question]\n{instruction}\n\n[Reference]\n{reference}\n\n[Answer]\n{input}"
template_wo_ref = "Please act as a precise judge and evaluate the quality of the answer to question. Rate the answer from 0 to 9, where a higher value means a better answer. Please respond with an integer between 0 and 9. \n\n[Question]\n{instruction}\n\n[Answer]\n{input}"
def evaluate(prompts):
    ## greedy decoding (beam search with best_of=2); request per-token logprobs so the
    ## digit-token probabilities of the first generated token can be read back
    sampling_params = SamplingParams(temperature=0,
                                     top_p=1,
                                     top_k=-1,
                                     max_tokens=10,
                                     stop='</s>',
                                     presence_penalty=1.2,
                                     logprobs=20,
                                     use_beam_search=True,
                                     best_of=2)
    outputs = model.generate(prompts, sampling_params)
    ## vLLM may return requests out of order; restore the original prompt order by request_id
    outputs = [(int(output.request_id), output) for output in outputs]
    sorted_list = sorted(outputs, key=lambda x: x[0])
    outputs = [x[1] for x in sorted_list]
    scores = []
    for output in outputs:
        logprobs = output.outputs[0].logprobs
        ## token ids of the digits "0"-"9" under the LLaMA tokenizer
        class_ids = [29900, 29896, 29906, 29941, 29946, 29945, 29953, 29955, 29947, 29929]
        class_values = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
        try:
            ## collect the probability of each digit token at the first generated position
            class_probs = []
            for class_id in class_ids:
                temp_list = [v for k, v in logprobs[0].items() if class_id == k]
                if len(temp_list) == 0:
                    class_probs.append(0)
                else:
                    class_probs.append(np.exp(temp_list[0]))
            ## renormalize over the digit tokens and take the expected rating as the score
            class_probs = np.array(class_probs) / np.sum(class_probs)
            score = np.sum(class_probs * np.array(class_values))
        except Exception as e:
            print(f'{e}')
            score = 0.
        scores.append(score)
    return scores
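
### quick sanity check of the scoring rule with toy logprobs (illustrative values, not from a
### real model run): the score is the expectation of the digit values under the renormalized
### digit-token probabilities, so mass split 60/40 between "7" and "8" gives 7.4
_toy_logprobs = {29955: np.log(0.6), 29947: np.log(0.4)}  ## token ids of "7" and "8"
_toy_ids = [29900, 29896, 29906, 29941, 29946, 29945, 29953, 29955, 29947, 29929]
_toy_probs = np.array([np.exp(_toy_logprobs[i]) if i in _toy_logprobs else 0.0 for i in _toy_ids])
_toy_probs = _toy_probs / np.sum(_toy_probs)
assert abs(np.sum(_toy_probs * np.arange(10)) - 7.4) < 1e-6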
### load the model
path_to_model = ''
model = LLM(model=path_to_model)

### test example
question = ''
response = ''
reference = ''  ## the evaluation is reference-based if a reference answer is given

## reference-based evaluation: form the prompt with the reference answer
prompt = template_w_ref.format(instruction=question, reference=reference, input=response)
## reference-free evaluation: form the prompt without a reference
## (this overwrites the prompt above; use one or the other)
prompt = template_wo_ref.format(instruction=question, input=response)

## generate the score
score = evaluate([prompt])
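
### a minimal batched-usage sketch (the `samples` list and its fields are illustrative
### placeholders, not part of the original script): pick the template per sample depending
### on whether a reference answer is available, then score all prompts in one evaluate() call
# samples = [
#     {"question": "...", "response": "...", "reference": "..."},  ## scored against the reference
#     {"question": "...", "response": "...", "reference": ""},     ## scored without a reference
# ]
# prompts = [
#     template_w_ref.format(instruction=s["question"], reference=s["reference"], input=s["response"])
#     if s["reference"]
#     else template_wo_ref.format(instruction=s["question"], input=s["response"])
#     for s in samples
# ]
# scores = evaluate(prompts)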