gpt_judge.py
import argparse
import json
import os
import time
from multiprocessing import Pool

import openai
import tqdm

# OpenAI API configuration (fill in the endpoint and key before running).
openai.api_base = ""
openai.api_key = ''

# Model used as the judge.
gpt_model = 'gpt-3.5-turbo'

parser = argparse.ArgumentParser()
parser.add_argument("--input_file", type=str)
parser.add_argument("--output_file", type=str)
args = parser.parse_args()

# System prompt given to the judge model.
system_prompt = '''
You are an intelligent chatbot designed for evaluating the correctness of generative outputs for question-answer pairs. Your task is to compare the predicted answer with the correct answer and determine if they match meaningfully. Here's how you can accomplish the task:
------
##INSTRUCTIONS:
- Focus on the meaningful match between the predicted answer and the correct answer.
- Consider synonyms or paraphrases as valid matches.
- Evaluate the correctness of the prediction compared to the answer.
'''
def judge(ele):
    """Ask the GPT judge whether the predicted answer matches the ground-truth answer."""
    template = '''Please evaluate the following video-based question-answer pair:
Question: {}
Correct Answer: {}
Predicted Answer: {}
If the predicted answer expresses the same meaning as the correct answer, please output 1; otherwise, output 0.
DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only provide 0 or 1.
'''
    gpt_judge = []
    # Strip the multiple-choice instruction from the question before building the judge prompt.
    prompt = template.format(ele['prompt'].replace("Answer with the option's letter from the given choices directly.", ""), ele['gt'], ele['pred'])
    max_retries = 20
    retry_delay = 5
    retries = 0
    output = None
    # Retry the API call until a response is returned or the retry budget is exhausted.
    while output is None and retries < max_retries:
        try:
            messages = [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": prompt},
            ]
            output = openai.ChatCompletion.create(
                model=gpt_model,
                max_tokens=10,
                temperature=0,
                messages=messages)
            if output is not None:
                output = output['choices'][0]['message']['content']
            else:
                retries += 1
                print(f"Attempt {retries}: Failed to get response, retrying after {retry_delay} seconds...")
                time.sleep(retry_delay)
        except Exception as e:
            print(f"An error occurred: {e}")
            retries += 1
            print(f"Attempt {retries}: Exception encountered, retrying after {retry_delay} seconds...")
            time.sleep(retry_delay)
    if output is None:
        print("Failed to get a valid response from the API after maximum retries.")
        gpt_judge.append("No response")
    else:
        gpt_judge.append(output)
        print(output)
    ele['gpt_judge'] = gpt_judge
    return ele
if __name__ == "__main__":
    # Read one JSON object per line from the input file.
    gpt_input = [json.loads(q) for q in open(os.path.expanduser(args.input_file), "r")]
    # Judge all examples in parallel with a pool of 150 workers.
    with Pool(150) as p:
        result = list(tqdm.tqdm(p.imap(judge, gpt_input), total=len(gpt_input)))
    # Append each judged example to the output file as one JSON line.
    with open(args.output_file, 'a') as output_file:
        for ele in result:
            output_file.write(json.dumps(ele) + "\n")
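
# Usage sketch (the file names below are placeholder examples, not part of the script):
#
#   python gpt_judge.py --input_file predictions.jsonl --output_file judged.jsonl
#
# Each line of the input JSONL is expected to carry the fields read in judge():
#   {"prompt": "...", "gt": "correct answer", "pred": "model prediction"}
# The script appends the same object to the output file with an extra "gpt_judge"
# field holding the judge's "0"/"1" verdict (or "No response" if all retries fail).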