Initial commit: add evaluators and support ChatGLM/ChatGLM2.
parent
620be4a685
commit
299beda00e
@@ -1,3 +1,12 @@
# LLM_Evaluator

A simple program to evaluate large language models.

## Recommended Requirements

- Python 3.8
- torch 1.13.1+cu117
- transformers 4.33.2
- accelerate 0.26.1
- tqdm 4.66.1
- openai 1.10.0
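A minimal invocation sketch (flag names are taken from the argument parser in the entry script below; the filename `eval.py` is an assumption, since the diff view does not show file paths, and the checkpoint name is hypothetical):

```
# zero-shot evaluation of ChatGLM2 on GPU 0
python eval.py --model_name chatglm2-6b --cuda_device 0

# 5-shot chain-of-thought evaluation with a fine-tuned checkpoint
python eval.py --model_name chatglm2-6b --few_shot --cot --ntrain 5 --cuda_device 0 --finetune my-checkpoint
```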
@@ -0,0 +1,97 @@
import os
import argparse
import time

import pandas as pd
import torch

# from evaluators.chatgpt import ChatGPT_Evaluator
from evaluators.chatglm import ChatGLM_Evaluator
from evaluators.chatglm2 import ChatGLM_Evaluator as ChatGLM2_Evaluator

choices = ["A", "B", "C", "D"]


def main(args):
    if "turbo" in args.model_name or "gpt-4" in args.model_name:
        print("Not supported yet")
        return -1
        # evaluator = ChatGPT_Evaluator(
        #     choices=choices,
        #     k=args.ntrain,
        #     api_key=args.openai_key,
        #     model_name=args.model_name
        # )
    elif "chatglm2" in args.model_name:  # checked before "chatglm", which would also match
        if args.cuda_device:
            os.environ["CUDA_VISIBLE_DEVICES"] = args.cuda_device
        device = torch.device("cuda")
        fine_tune_model = args.finetune if args.finetune else None
        evaluator = ChatGLM2_Evaluator(
            choices=choices,
            k=args.ntrain,
            model_name=args.model_name,
            device=device,
            finetune=fine_tune_model
        )
    elif "chatglm" in args.model_name:
        if args.cuda_device:
            os.environ["CUDA_VISIBLE_DEVICES"] = args.cuda_device
        device = torch.device("cuda")
        fine_tune_model = args.finetune if args.finetune else None
        evaluator = ChatGLM_Evaluator(
            choices=choices,
            k=args.ntrain,
            model_name=args.model_name,
            device=device,
            finetune=fine_tune_model
        )
    else:
        print("Unknown model name")
        return -1

    if not os.path.exists("logs"):
        os.mkdir("logs")
    run_date = time.strftime('%Y-%m-%d_%H-%M-%S', time.localtime(time.time()))
    fine_tune_model_name = args.finetune if args.finetune else 'original'
    save_result_dir = os.path.join("logs", f"{args.model_name}_{fine_tune_model_name}_{run_date}")
    os.mkdir(save_result_dir)

    subject_list = ['computer_architecture', 'car_knowledge', 'car_use', 'car_market']

    for subject_name in subject_list:
        print(subject_name)
        # subject_name = args.subject
        val_file_path = os.path.join('data/val', f'{subject_name}_val.csv')
        val_df = pd.read_csv(val_file_path)
        if args.few_shot:
            dev_file_path = os.path.join('data/dev', f'{subject_name}_dev.csv')
            dev_df = pd.read_csv(dev_file_path)
            correct_ratio = evaluator.eval_subject(subject_name, val_df, dev_df, few_shot=args.few_shot,
                                                   save_result_dir=save_result_dir, cot=args.cot)
        else:
            correct_ratio = evaluator.eval_subject(subject_name, val_df, few_shot=args.few_shot,
                                                   save_result_dir=save_result_dir)
        print("Acc:", correct_ratio)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--ntrain", "-k", type=int, default=5)
    parser.add_argument("--openai_key", type=str, default="xxx")
    parser.add_argument("--minimax_group_id", type=str, default="xxx")
    parser.add_argument("--minimax_key", type=str, default="xxx")
    parser.add_argument("--few_shot", action="store_true")
    parser.add_argument("--model_name", type=str)
    parser.add_argument("--cot", action="store_true")
    # parser.add_argument("--subject", "-s", type=str, default="operating_system")
    parser.add_argument("--cuda_device", type=str)
    parser.add_argument("--finetune", type=str)
    args = parser.parse_args()
    main(args)
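For reference, a hedged sketch of the CSV layout the evaluators expect (column names inferred from `format_example` and `eval_subject` below; `explanation` is only read when building chain-of-thought few-shot examples from `data/dev`; the file name here is hypothetical):

```python
import os
import pandas as pd

# hypothetical one-row validation file; real files live at data/val/{subject}_val.csv
os.makedirs("data/val", exist_ok=True)
pd.DataFrame([{
    "question": "示例问题?",
    "A": "选项一", "B": "选项二", "C": "选项三", "D": "选项四",
    "answer": "A",
    "explanation": "示例解析",  # used by data/dev files for CoT prompts
}]).to_csv("data/val/example_val.csv", index=False)
```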
@@ -0,0 +1,164 @@
import os
import re

import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers.generation.logits_process import LogitsProcessor
from transformers.generation.utils import LogitsProcessorList

from evaluators.evaluator import Evaluator


class InvalidScoreLogitsProcessor(LogitsProcessor):
    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
        # guard against NaN/inf logits by zeroing them and forcing a fixed token to dominate
        if torch.isnan(scores).any() or torch.isinf(scores).any():
            scores.zero_()
            scores[..., 5] = 5e4
        return scores


class ChatGLM_Evaluator(Evaluator):
    def __init__(self, choices, k, model_name, device, finetune=None):
        super(ChatGLM_Evaluator, self).__init__(choices, model_name, k)
        # try adding 'mirror="tuna"' and 'resume_download=True' if facing the 'read timed out' problem,
        # or directly clone the model
        self.tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True, mirror="tuna")
        if finetune:
            CHECKPOINT_PATH = "ptuning/" + finetune
            config = AutoConfig.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True, pre_seq_len=128)
            self.model = AutoModel.from_pretrained("THUDM/chatglm-6b", config=config, trust_remote_code=True)
            # keep only the P-tuning prefix-encoder weights from the checkpoint
            prefix_state_dict = torch.load(os.path.join(CHECKPOINT_PATH, "pytorch_model.bin"))
            new_prefix_state_dict = {}
            for key, value in prefix_state_dict.items():
                if key.startswith("transformer.prefix_encoder."):
                    new_prefix_state_dict[key[len("transformer.prefix_encoder."):]] = value
            self.model.transformer.prefix_encoder.load_state_dict(new_prefix_state_dict)
            self.model = self.model.half().to(device)
            self.model.transformer.prefix_encoder.float()
            print("Model loaded! (GLM + " + finetune + ")")
        else:
            self.model = AutoModel.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True, mirror="tuna",
                                                   resume_download=True).half().to(device)
            print("Model loaded! (GLM)")
        # self.model = self.model.eval()

    def eval_subject(self, subject_name, test_df, dev_df=None, few_shot=False, cot=False, save_result_dir=None):
        correct_num = 0
        if save_result_dir:
            if few_shot:
                result = []
            score = []
        if few_shot:
            history = self.generate_few_shot_prompt(subject_name, dev_df, cot=cot)
        else:
            history = []
        answers = list(test_df['answer'])
        for row_index, row in tqdm(test_df.iterrows(), total=len(test_df)):
            question = self.format_example(row, include_answer=False, cot=cot)
            if few_shot:
                response, _ = self.model.chat(self.tokenizer, question, do_sample=False, history=history)
                response = response.strip()
                # For ChatGLM, we use answer extraction in answer-only mode too.
                ans, direct_extract = self.extract_cot_answer(row, response)
            else:  # zero-shot by extracting the answer from the first-token distribution
                ans = self.generate_dist(self.model, self.tokenizer, question, do_sample=False, max_length=2048,
                                         history=history)
            if ans == answers[row_index]:
                correct_num += 1
                correct = 1
            else:
                correct = 0
            if save_result_dir:
                if few_shot:
                    result.append(response)
                score.append(correct)
        correct_ratio = 100 * correct_num / len(answers)

        if save_result_dir:
            if few_shot:
                test_df['model_output'] = result
            test_df['correctness'] = score
            test_df.to_csv(os.path.join(save_result_dir, f'{subject_name}_test.csv'))

        return correct_ratio

    def generate_few_shot_prompt(self, subject, dev_df, cot=False):
        message = []
        k = self.k
        if self.k == -1:
            k = dev_df.shape[0]
        message.append(self.format_example(dev_df.iloc[0, :], cot=cot,
                                           add_prompt=f"以下是中国关于{subject}考试的单项选择题,请选出其中的正确答案。\n\n"))
        for i in range(1, k):
            message.append(self.format_example(dev_df.iloc[i, :], cot=cot))
        return message

    def format_example(self, line, include_answer=True, cot=False, add_prompt=''):
        example = add_prompt + line['question']
        # print(example)
        for choice in self.choices:
            example += f'\n{choice}. {line[f"{choice}"]}'
        example += '\n答案:'
        if include_answer:
            if cot:
                ans = "让我们一步一步思考,\n" + line["explanation"] + f"\n所以答案是{line['answer']}。"
            else:
                ans = line["answer"]
            return example, ans
        return example

    def extract_cot_answer(self, line, gen_ans):
        # 1) explicit "所以答案是X。" conclusion
        m = re.findall(r'所以答案是(.+?)。', gen_ans, re.M)
        if len(m) > 0 and m[-1] in self.choices:
            return m[-1], True
        answer_patterns = [
            r'([ABCD])是正确的',
            r'选项([ABCD])正确',
            r'答案为([ABCD])',
            r'答案是([ABCD])',
            r'答案([ABCD])',
            r'选择([ABCD])',
            r'答案:([ABCD])',
            r'选择答案([ABCD])'
        ]
        # 2) RE extraction
        for answer_pattern in answer_patterns:
            m = re.search(answer_pattern, gen_ans, re.M)
            if m:
                answer = m.group(1)
                return answer, False
        # 3) the response contains exactly one choice character
        m = re.findall(r'[ABCD]', gen_ans, re.M)
        if len(m) == 1:
            answer = m[0]
            return answer, False
        # 4) the response contains exactly one choice's text
        answer_word_counter = 0
        for c in self.choices:
            if str(line[f'{c}']) in gen_ans:
                answer = c
                answer_word_counter += 1
        if answer_word_counter == 1:
            return answer, False
        return '-', False

    def generate_dist(self, model, tokenizer, query, history, num_beams=1, max_length=2048,
                      do_sample=False, top_p=0.7, temperature=0.95, logits_processor=None, **kwargs):
        if history is None:
            history = []
        if logits_processor is None:
            logits_processor = LogitsProcessorList()
        logits_processor.append(InvalidScoreLogitsProcessor())
        gen_kwargs = {"num_beams": num_beams, "do_sample": do_sample, "top_p": top_p, "max_length": max_length,
                      "temperature": temperature, "logits_processor": logits_processor, **kwargs}
        if not history:
            prompt = query
        else:
            prompt = ""
            for i, (old_query, response) in enumerate(history):
                prompt += "[Round {}]\n问:{}\n答:{}\n".format(i, old_query, response)
            prompt += "[Round {}]\n问:{}\n答:".format(len(history), query)
        inputs = tokenizer([prompt], return_tensors="pt")
        inputs = inputs.to(model.device)
        outputs = model.generate(**inputs, return_dict_in_generate=True, output_scores=True, **gen_kwargs)

        # rank the choices by the logits of the first generated token;
        # 167/333/251/416 are presumably the ChatGLM-6B vocabulary ids of "A"/"B"/"C"/"D"
        score = outputs.scores[0][0].tolist()
        choice_score = [score[167], score[333], score[251], score[416]]
        ranked_index = [index for index, value in sorted(enumerate(choice_score), key=lambda x: x[1], reverse=True)]
        return self.choices[ranked_index[0]]
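The hard-coded indices in `generate_dist` are opaque. A model-agnostic sketch would derive the choice-token ids from the tokenizer at runtime, assuming each choice letter encodes to a single token (which should be verified per model); the helper name is hypothetical:

```python
def choice_token_ids(tokenizer, choices=("A", "B", "C", "D")):
    """Look up the single-token id of each choice letter."""
    ids = []
    for c in choices:
        toks = tokenizer.encode(c, add_special_tokens=False)
        assert len(toks) == 1, f"choice {c!r} does not map to a single token: {toks}"
        ids.append(toks[0])
    return ids

# hypothetical usage inside generate_dist:
# choice_score = [score[i] for i in choice_token_ids(tokenizer)]
```

The same applies to the ChatGLM2 evaluator below, which carries the same four indices over.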
@@ -0,0 +1,155 @@
import os
import re

import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers.generation.logits_process import LogitsProcessor
from transformers.generation.utils import LogitsProcessorList
from peft import PeftModel

from evaluators.evaluator import Evaluator


class InvalidScoreLogitsProcessor(LogitsProcessor):
    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
        # guard against NaN/inf logits by zeroing them and forcing a fixed token to dominate
        if torch.isnan(scores).any() or torch.isinf(scores).any():
            scores.zero_()
            scores[..., 5] = 5e4
        return scores


class ChatGLM_Evaluator(Evaluator):
    def __init__(self, choices, k, model_name, device, finetune=None):
        super(ChatGLM_Evaluator, self).__init__(choices, model_name, k)
        # try adding 'mirror="tuna"' and 'resume_download=True' if facing the 'read timed out' problem,
        # or directly clone the model
        self.tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm2-6b", trust_remote_code=True, mirror="tuna")
        self.model = AutoModel.from_pretrained("THUDM/chatglm2-6b", trust_remote_code=True, mirror="tuna",
                                               resume_download=True).half().to(device)
        if finetune:
            # wrap the base model with a LoRA adapter loaded via PEFT
            peft_model_id = "lora/" + finetune
            self.model = PeftModel.from_pretrained(self.model, peft_model_id)
            print("Model loaded! (GLM2 + " + finetune + ")")
        else:
            print("Model loaded! (GLM2)")
        # self.model = self.model.eval()

    def eval_subject(self, subject_name, test_df, dev_df=None, few_shot=False, cot=False, save_result_dir=None):
        correct_num = 0
        if save_result_dir:
            if few_shot:
                result = []
            score = []
        if few_shot:
            history = self.generate_few_shot_prompt(subject_name, dev_df, cot=cot)
        else:
            history = []
        answers = list(test_df['answer'])
        for row_index, row in tqdm(test_df.iterrows(), total=len(test_df)):
            question = self.format_example(row, include_answer=False, cot=cot)
            if few_shot:
                response, _ = self.model.chat(self.tokenizer, question, do_sample=False, history=history)
                response = response.strip()
                # As with ChatGLM, we use answer extraction in answer-only mode too.
                ans, direct_extract = self.extract_cot_answer(row, response)
            else:  # zero-shot by extracting the answer from the first-token distribution
                ans = self.generate_dist(self.model, self.tokenizer, question, do_sample=False, max_length=2048,
                                         history=history)
            if ans == answers[row_index]:
                correct_num += 1
                correct = 1
            else:
                correct = 0
            if save_result_dir:
                if few_shot:
                    result.append(response)
                score.append(correct)
        correct_ratio = 100 * correct_num / len(answers)

        if save_result_dir:
            if few_shot:
                test_df['model_output'] = result
            test_df['correctness'] = score
            test_df.to_csv(os.path.join(save_result_dir, f'{subject_name}_{correct_ratio}_test.csv'))

        return correct_ratio

    def generate_few_shot_prompt(self, subject, dev_df, cot=False):
        message = []
        k = self.k
        if self.k == -1:
            k = dev_df.shape[0]
        message.append(self.format_example(dev_df.iloc[0, :], cot=cot,
                                           add_prompt=f"以下是中国关于{subject}考试的单项选择题,请选出其中的正确答案。\n\n"))
        for i in range(1, k):
            message.append(self.format_example(dev_df.iloc[i, :], cot=cot))
        return message

    def format_example(self, line, include_answer=True, cot=False, add_prompt=''):
        example = add_prompt + line['question']
        # print(example)
        for choice in self.choices:
            example += f'\n{choice}. {line[f"{choice}"]}'
        example += '\n答案:'
        if include_answer:
            if cot:
                ans = "让我们一步一步思考,\n" + line["explanation"] + f"\n所以答案是{line['answer']}。"
            else:
                ans = line["answer"]
            return example, ans
        return example

    def extract_cot_answer(self, line, gen_ans):
        # 1) explicit "所以答案是X。" conclusion
        m = re.findall(r'所以答案是(.+?)。', gen_ans, re.M)
        if len(m) > 0 and m[-1] in self.choices:
            return m[-1], True
        answer_patterns = [
            r'([ABCD])是正确的',
            r'选项([ABCD])正确',
            r'答案为([ABCD])',
            r'答案是([ABCD])',
            r'答案([ABCD])',
            r'选择([ABCD])',
            r'答案:([ABCD])',
            r'选择答案([ABCD])'
        ]
        # 2) RE extraction
        for answer_pattern in answer_patterns:
            m = re.search(answer_pattern, gen_ans, re.M)
            if m:
                answer = m.group(1)
                return answer, False
        # 3) the response contains exactly one choice character
        m = re.findall(r'[ABCD]', gen_ans, re.M)
        if len(m) == 1:
            answer = m[0]
            return answer, False
        # 4) the response contains exactly one choice's text
        answer_word_counter = 0
        for c in self.choices:
            if str(line[f'{c}']) in gen_ans:
                answer = c
                answer_word_counter += 1
        if answer_word_counter == 1:
            return answer, False
        return '-', False

    def generate_dist(self, model, tokenizer, query, history, num_beams=1, max_length=2048,
                      do_sample=False, top_p=0.7, temperature=0.95, logits_processor=None, **kwargs):
        if history is None:
            history = []
        if logits_processor is None:
            logits_processor = LogitsProcessorList()
        logits_processor.append(InvalidScoreLogitsProcessor())
        gen_kwargs = {"num_beams": num_beams, "do_sample": do_sample, "top_p": top_p, "max_length": max_length,
                      "temperature": temperature, "logits_processor": logits_processor, **kwargs}
        if not history:
            prompt = query
        else:
            prompt = ""
            for i, (old_query, response) in enumerate(history):
                prompt += "[Round {}]\n问:{}\n答:{}\n".format(i, old_query, response)
            prompt += "[Round {}]\n问:{}\n答:".format(len(history), query)
        inputs = tokenizer([prompt], return_tensors="pt")
        inputs = inputs.to(model.device)
        outputs = model.generate(**inputs, return_dict_in_generate=True, output_scores=True, **gen_kwargs)

        # rank the choices by the logits of the first generated token;
        # the ids are carried over from the ChatGLM evaluator and assumed to map to "A"/"B"/"C"/"D",
        # which is worth verifying against the ChatGLM2 tokenizer
        score = outputs.scores[0][0].tolist()
        choice_score = [score[167], score[333], score[251], score[416]]
        ranked_index = [index for index, value in sorted(enumerate(choice_score), key=lambda x: x[1], reverse=True)]
        return self.choices[ranked_index[0]]
@@ -0,0 +1,169 @@
import os
import re
from time import sleep

from tqdm import tqdm
import openai

from evaluators.evaluator import Evaluator


class ChatGPT_Evaluator(Evaluator):
    def __init__(self, choices, k, api_key, model_name):
        super(ChatGPT_Evaluator, self).__init__(choices, model_name, k)
        openai.api_key = api_key

    def format_example(self, line, include_answer=True, cot=False):
        example = line['question']
        for choice in self.choices:
            example += f'\n{choice}. {line[f"{choice}"]}'

        example += '\n答案:'
        if include_answer:
            if cot:
                ans = line["answer"]
                content = "让我们一步一步思考,\n" + line["explanation"] + f"\n所以答案是{ans}。"
                return [
                    {"role": "user", "content": example},
                    {"role": "assistant", "content": content}
                ]
            else:
                return [
                    {"role": "user", "content": example},
                    {"role": "assistant", "content": line["answer"]}
                ]
        else:
            return [
                {"role": "user", "content": example},
            ]

    def generate_few_shot_prompt(self, subject, dev_df, cot=False):
        prompt = [
            {
                "role": "system",
                "content": f"你是一个中文人工智能助手,以下是中国关于{subject}考试的单项选择题,请选出其中的正确答案。"
            }
        ]
        k = self.k
        if self.k == -1:
            k = dev_df.shape[0]
        for i in range(k):
            tmp = self.format_example(dev_df.iloc[i, :], include_answer=True, cot=cot)
            if i == 0:
                tmp[0]["content"] = f"以下是中国关于{subject}考试的单项选择题,请选出其中的正确答案。\n\n" + tmp[0]["content"]
            prompt += tmp
        return prompt

    def eval_subject(self, subject_name, test_df, dev_df=None, few_shot=False, save_result_dir=None, cot=False):
        correct_num = 0
        if save_result_dir:
            result = []
            score = []
        if few_shot:
            few_shot_prompt = self.generate_few_shot_prompt(subject_name, dev_df, cot=cot)
        else:
            few_shot_prompt = [
                {
                    "role": "system",
                    "content": f"你是一个中文人工智能助手,以下是中国关于{subject_name}考试的单项选择题,请选出其中的正确答案。"
                }
            ]
        answers = list(test_df['answer'])
        for row_index, row in tqdm(test_df.iterrows(), total=len(test_df)):
            question = self.format_example(row, include_answer=False)
            full_prompt = few_shot_prompt + question
            if not few_shot:
                full_prompt[-1]["content"] = f"以下是中国关于{subject_name}考试的单项选择题,请选出其中的正确答案。\n\n" + full_prompt[-1]["content"]
            response = None
            timeout_counter = 0
            # retry on API errors, counting timeouts, for at most 31 attempts
            while response is None and timeout_counter <= 30:
                try:
                    response = openai.ChatCompletion.create(
                        model=self.model_name,
                        messages=full_prompt,
                        temperature=0.
                    )
                except Exception as msg:
                    if "timeout=600" in str(msg):
                        timeout_counter += 1
                    print(msg)
                    sleep(5)
                    continue
            if response is None:
                response_str = ""
            else:
                response_str = response['choices'][0]['message']['content']
            # print(response_str)
            if cot:
                ans_list = re.findall(r"答案是(.+?)。", response_str)
                if len(ans_list) == 0:
                    ans_list = re.findall(r"答案为(.+?)。", response_str)
                if len(ans_list) == 0:
                    ans_list = re.findall(r"选项(.+?)是正确的。", response_str)

                if len(ans_list) == 0:
                    correct = 0
                else:
                    if self.exact_match(ans_list[-1], row["answer"]):
                        correct_num += 1
                        correct = 1
                    else:
                        correct = 0
            else:
                response_str = response_str.strip()
                if few_shot:
                    if len(response_str) > 0 and self.exact_match(response_str, row["answer"]):
                        correct_num += 1
                        correct = 1
                    else:
                        correct = 0
                else:
                    if len(response_str) > 0:
                        ans_list = self.extract_ans(response_str)
                        if len(ans_list) > 0 and (ans_list[-1] == row["answer"]):
                            correct_num += 1
                            correct = 1
                        else:
                            correct = 0
                    else:
                        correct = 0
            if save_result_dir:
                result.append(response_str)
                score.append(correct)
        correct_ratio = 100 * correct_num / len(answers)

        if save_result_dir:
            test_df['model_output'] = result
            test_df["correctness"] = score
            test_df.to_csv(os.path.join(save_result_dir, f'{subject_name}_val.csv'), encoding="utf-8", index=False)
        return correct_ratio

    def extract_ans(self, response_str):
        pattern = [
            r"^选([A-D])",
            r"^选项([A-D])",
            r"答案是\s?选?项?\s?([A-D])",
            r"答案为\s?选?项?\s?([A-D])",
            r"答案应为\s?选?项?\s?([A-D])",
            r"答案选\s?选?项?\s?([A-D])",
            r"答案是:\s?选?项?\s?([A-D])",
            r"答案应该是:\s?选?项?\s?([A-D])",
            r"正确的一项是\s?([A-D])",
            r"答案为:\s?选?项?\s?([A-D])",
            r"答案应为:\s?选?项?\s?([A-D])",
            r"答案:\s?选?项?\s?([A-D])",
            r"答案是:\s?选?项?\s?([A-D])",
            r"答案应该是:\s?选?项?\s?([A-D])",
            r"答案为:\s?选?项?\s?([A-D])",
            r"答案应为:\s?选?项?\s?([A-D])",
            r"答案:\s?选?项?\s?([A-D])",
        ]
        ans_list = []
        # shortcut: a response that opens directly with the choice letter
        if response_str[0] in ["A", 'B', 'C', 'D']:
            ans_list.append(response_str[0])
        for p in pattern:
            if len(ans_list) == 0:
                ans_list = re.findall(p, response_str)
            else:
                break
        return ans_list
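A quick, self-contained sanity check of the regex fallbacks in `extract_ans` (the instantiation values are hypothetical; this only needs the `openai` package importable and never calls the API):

```python
from evaluators.chatgpt import ChatGPT_Evaluator

ev = ChatGPT_Evaluator(choices=["A", "B", "C", "D"], k=5, api_key="xxx", model_name="gpt-3.5-turbo")
print(ev.extract_ans("选B,因为更安全"))   # -> ['B'] via the "^选([A-D])" pattern
print(ev.extract_ans("答案是 C"))          # -> ['C'] via "答案是\s?选?项?\s?([A-D])"
print(ev.extract_ans("A。电动车需要定期保养"))  # -> ['A'] via the leading-letter shortcut
```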
@@ -0,0 +1,47 @@
import re
import string


class Evaluator:
    def __init__(self, choices, model_name, k=-1):
        self.choices = choices
        self.model_name = model_name
        self.k = k
        self.puncs = list(string.punctuation)

    def format_example(self, line, include_answer=True):
        example = line['question']
        # print(example)
        for choice in self.choices:
            example += f'\n{choice}. {line[f"{choice}"]}'
        example += '\n答案:'
        if include_answer:
            example += f'{line["answer"]}\n\n'
        return example

    def generate_few_shot_prompt(self, subject, dev_df):
        prompt = f"以下是中国关于{subject}考试的单项选择题,请选出其中的正确答案。\n\n"
        k = self.k
        if self.k == -1:
            k = dev_df.shape[0]
        for i in range(k):
            prompt += self.format_example(dev_df.iloc[i, :])
        return prompt

    def eval_subject(self, subject_name, test_df, dev_df=None, few_shot=False, save_result_dir=None):
        pass

    def normalize_answer(self, s):

        def white_space_fix(text):
            return ' '.join(text.split())

        def remove_punc(text):
            exclude = set(self.puncs)
            return ''.join(ch for ch in text if ch not in exclude)

        def lower(text):
            return text.lower()

        return white_space_fix(remove_punc(lower(s)))

    def exact_match(self, pred, target):
        return self.normalize_answer(pred) == self.normalize_answer(target)
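A small usage example of the normalization helpers (note that `string.punctuation` is ASCII-only, so full-width punctuation such as "。" is not stripped):

```python
from evaluators.evaluator import Evaluator

ev = Evaluator(choices=["A", "B", "C", "D"], model_name="demo")
print(ev.normalize_answer(" A. "))  # -> "a" (lower-cased, ASCII punctuation removed, whitespace collapsed)
print(ev.exact_match(" A. ", "a"))  # -> True
print(ev.exact_match("A。", "A"))   # -> False, since "。" is not in string.punctuation
```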
@@ -0,0 +1,37 @@
import pandas as pd

# Read the CSV file
df = pd.read_csv("data/val/car_use_val.csv")
correct_num = 0
total_num = 0

# Iterate over the rows and present each question as a multiple-choice prompt
for index, row in df.iterrows():
    question_text = row['question']
    options = [row['A'], row['B'], row['C'], row['D']]
    answer = row['answer']

    # Assemble the question text
    question_text = f"{question_text}\n"
    for i, option in enumerate(options):
        question_text += f"{chr(65 + i)}. {option}\n"

    # Show the question and record the user's answer
    print(f"问题 {index + 1}:")
    print(question_text)
    user_answer = input("请输入你的答案: ")
    df.loc[index, 'user_answer'] = user_answer
    print(f"答案: {answer}\n")
    total_num += 1
    if user_answer == answer:
        print("回答正确!\n")
        correct_num += 1
    else:
        print("回答错误!\n")

# Compute the accuracy
correct_ratio = 100 * correct_num / total_num
print(f"正确率: {correct_ratio}%")
# Save the results to a file
df.to_csv("logs/car_use_val_gpt3.5_" + str(correct_ratio) + ".csv")