forked from YangLing0818/buffer-of-thought-llm
-
Notifications
You must be signed in to change notification settings - Fork 0
/
validate_results.py
52 lines (43 loc) · 1.52 KB
/
validate_results.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
import json
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('--task_name', type=str, default='gameof24',
                    choices=['gameof24', 'checkmate', 'wordsorting'])

# Task name -> ground-truth benchmark file (JSONL, one {"target": ...} per line).
BENCHMARK_PATH_DICT = {
    'gameof24': 'benchmarks/gameof24.jsonl',
    'checkmate': 'benchmarks/CheckmateInOne.jsonl',
    'wordsorting': 'benchmarks/word_sorting.jsonl',
}
# Task name -> model output file (JSONL, one {"result": ...} per line).
TEST_PATH_DICT = {
    'gameof24': 'test_results/BoT_gameof24.jsonl',
    'checkmate': 'test_results/BoT_checkmate.jsonl',
    'wordsorting': 'test_results/BoT_wordsorting.jsonl',
}


def load_jsonl_field(path, field):
    """Return the value of ``field`` from every JSON line of the file at ``path``."""
    # `with` guarantees the file handle is closed (the original leaked it).
    with open(path) as f:
        return [json.loads(line)[field] for line in f]


def score(truth, results, task):
    """Score model results against ground truth.

    truth   -- list of ground-truth target strings
    results -- list of raw model result strings; only the first line of each
               result is considered
    task    -- task name; for 'gameof24' the answer is an arithmetic
               expression that must evaluate to 24, all other tasks are
               scored by exact string match against ``truth``

    Returns (correct_count, total_count).
    """
    correct = 0
    cleaned = []
    for result in results:
        result = result.split('\n')[0]
        if task == 'gameof24':
            # Keep only the expression left of '=', e.g. "(1+2+3)*4 = 24".
            result = result.split('=')[0]
            cleaned.append(result)
            try:
                # NOTE(review): eval() on model output is unsafe for untrusted
                # input; tolerated here only for local benchmark scoring.
                if eval(result) == 24:
                    correct += 1
            except Exception:
                # Malformed expression -> scored as wrong (narrowed from a
                # bare `except:` which also caught KeyboardInterrupt).
                pass
        else:
            cleaned.append(result)
    if correct == 0:
        # Exact-match fallback. This is the normal path for non-gameof24
        # tasks (where `correct` is always still 0 here); for gameof24 it
        # only runs when no expression evaluated to 24, matching the
        # original behavior. zip() also guards against a length mismatch
        # between the two files (the original could raise IndexError).
        for expected, got in zip(truth, cleaned):
            if expected == got:
                correct += 1
    return correct, len(cleaned)


if __name__ == "__main__":
    args = parser.parse_args()
    task = args.task_name
    truth = load_jsonl_field(BENCHMARK_PATH_DICT[task], 'target')
    results = load_jsonl_field(TEST_PATH_DICT[task], 'result')
    correct, total = score(truth, results, task)
    # Guard against an empty results file (original raised ZeroDivisionError).
    accuracy = correct / total if total else 0.0
    print(f'correct number:{correct},accuracy:{accuracy}')