diff --git a/.gitignore b/.gitignore
index 5d381cc..ff4bd03 100644
--- a/.gitignore
+++ b/.gitignore
@@ -160,3 +160,10 @@ cython_debug/
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
+/.idea/
+/THUDM/
+/THUDM/chatglm-6b/
+/lora/
+/ptuning/
+/logs/
+/data/
diff --git a/README.md b/README.md
index 21cfef2..76dabfb 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,12 @@
 # LLM_Evaluator
-A simple program to evaluate large language model.
\ No newline at end of file
+A simple program to evaluate large language models.
+
+## Recommended Requirements
+
+- Python 3.8
+- torch 1.13.1+cu117
+- transformers 4.33.2
+- accelerate 0.26.1
+- tqdm 4.66.1
+- openai 1.10.0
diff --git a/eval.py b/eval.py
new file mode 100644
index 0000000..d1b94df
--- /dev/null
+++ b/eval.py
@@ -0,0 +1,97 @@
+import os
+import argparse
+import time
+
+import pandas as pd
+import torch
+
+# from evaluators.chatgpt import ChatGPT_Evaluator
+from evaluators.chatglm import ChatGLM_Evaluator
+from evaluators.chatglm2 import ChatGLM_Evaluator as ChatGLM2_Evaluator
+
+choices = ["A", "B", "C", "D"]
+
+
+def main(args):
+    # NOTE: "chatglm2" must be checked before "chatglm", since the shorter substring matches both.
+    if "turbo" in args.model_name or "gpt-4" in args.model_name:
+        print("Not supported yet")
+        return -1
+        # evaluator = ChatGPT_Evaluator(
+        #     choices=choices,
+        #     k=args.ntrain,
+        #     api_key=args.openai_key,
+        #     model_name=args.model_name
+        # )
+    elif "chatglm2" in args.model_name:
+        if args.cuda_device:
+            os.environ["CUDA_VISIBLE_DEVICES"] = args.cuda_device
+        device = torch.device("cuda")
+        fine_tune_model = args.finetune if args.finetune else None
+        evaluator = ChatGLM2_Evaluator(
+            choices=choices,
+            k=args.ntrain,
+            model_name=args.model_name,
+            device=device,
+            finetune=fine_tune_model
+        )
+    elif "chatglm" in args.model_name:
+        if args.cuda_device:
+            os.environ["CUDA_VISIBLE_DEVICES"] = args.cuda_device
+        device = torch.device("cuda")
+        fine_tune_model = args.finetune if args.finetune else None
+        evaluator = ChatGLM_Evaluator(
+            choices=choices,
+            k=args.ntrain,
+            model_name=args.model_name,
+            device=device,
+            finetune=fine_tune_model
+        )
+    else:
+        print("Unknown model name")
+        return -1
+
+    if not os.path.exists("logs"):
+        os.mkdir("logs")
+    run_date = time.strftime('%Y-%m-%d_%H-%M-%S', time.localtime(time.time()))
+    fine_tune_model_name = args.finetune if args.finetune else 'original'
+    save_result_dir = os.path.join("logs", f"{args.model_name}_{fine_tune_model_name}_{run_date}")
+    os.mkdir(save_result_dir)
+
+    subject_list = ['computer_architecture', 'car_knowledge', 'car_use', 'car_market']
+
+    for subject_name in subject_list:
+        print(subject_name)
+        # subject_name = args.subject
+        val_file_path = os.path.join('data/val', f'{subject_name}_val.csv')
+        val_df = pd.read_csv(val_file_path)
+        if args.few_shot:
+            dev_file_path = os.path.join('data/dev', f'{subject_name}_dev.csv')
+            dev_df = pd.read_csv(dev_file_path)
+            correct_ratio = evaluator.eval_subject(subject_name, val_df, dev_df, few_shot=args.few_shot,
+                                                   save_result_dir=save_result_dir, cot=args.cot)
+        else:
+            correct_ratio = evaluator.eval_subject(subject_name, val_df, few_shot=args.few_shot,
+                                                   save_result_dir=save_result_dir)
+        print("Acc:", correct_ratio)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--ntrain", "-k", type=int, default=5)
+    parser.add_argument("--openai_key", type=str, default="xxx")
+    parser.add_argument("--minimax_group_id", type=str, default="xxx")
+    parser.add_argument("--minimax_key", type=str, default="xxx")
+    parser.add_argument("--few_shot", action="store_true")
+    parser.add_argument("--model_name", type=str)
+    parser.add_argument("--cot", action="store_true")
+    # parser.add_argument("--subject", "-s", type=str, default="operating_system")
+    parser.add_argument("--cuda_device", type=str)
+    parser.add_argument("--finetune", type=str)
+    args = parser.parse_args()
+    main(args)
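For reference, a typical invocation under the layout assumed above (the subject list is hardcoded in eval.py; --finetune expects a checkpoint directory name under ptuning/ for ChatGLM or lora/ for ChatGLM2; the checkpoint name below is hypothetical):

    # few-shot evaluation of the base ChatGLM2 model on GPU 0
    python eval.py --model_name chatglm2-6b --cuda_device 0 --few_shot --ntrain 5

    # zero-shot evaluation of a hypothetical P-Tuning checkpoint in ptuning/my-ckpt
    python eval.py --model_name chatglm-6b --cuda_device 0 --finetune my-ckpt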
diff --git a/evaluators/chatglm.py b/evaluators/chatglm.py
new file mode 100644
index 0000000..4b62b73
--- /dev/null
+++ b/evaluators/chatglm.py
@@ -0,0 +1,164 @@
+import os
+import re
+
+import torch
+from tqdm import tqdm
+from transformers import AutoTokenizer, AutoModel, AutoConfig
+from transformers.generation.logits_process import LogitsProcessor
+from transformers.generation.utils import LogitsProcessorList
+
+from evaluators.evaluator import Evaluator
+
+
+class InvalidScoreLogitsProcessor(LogitsProcessor):
+    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
+        # If the scores degenerate to NaN/inf, replace them with a safe one-hot distribution.
+        if torch.isnan(scores).any() or torch.isinf(scores).any():
+            scores.zero_()
+            scores[..., 5] = 5e4
+        return scores
+
+
+class ChatGLM_Evaluator(Evaluator):
+    def __init__(self, choices, k, model_name, device, finetune=None):
+        super(ChatGLM_Evaluator, self).__init__(choices, model_name, k)
+        # Try adding 'mirror="tuna"' and 'resume_download=True' if facing the 'read timed out'
+        # problem, or directly clone the model.
+        self.tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True, mirror="tuna")
+        if finetune:
+            # Load a P-Tuning v2 checkpoint: only the prefix-encoder weights are restored
+            # on top of the base model.
+            CHECKPOINT_PATH = "ptuning/" + finetune
+            config = AutoConfig.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True, pre_seq_len=128)
+            self.model = AutoModel.from_pretrained("THUDM/chatglm-6b", config=config, trust_remote_code=True)
+            prefix_state_dict = torch.load(os.path.join(CHECKPOINT_PATH, "pytorch_model.bin"))
+            new_prefix_state_dict = {}
+            for k, v in prefix_state_dict.items():
+                if k.startswith("transformer.prefix_encoder."):
+                    new_prefix_state_dict[k[len("transformer.prefix_encoder."):]] = v
+            self.model.transformer.prefix_encoder.load_state_dict(new_prefix_state_dict)
+            self.model = self.model.half().to(device)
+            # Keep the prefix encoder in fp32 while the rest of the model runs in fp16.
+            self.model.transformer.prefix_encoder.float()
+            print("Model loaded! (GLM + " + finetune + ")")
+        else:
+            self.model = AutoModel.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True, mirror="tuna",
+                                                   resume_download=True).half().to(device)
+            print("Model loaded! (GLM)")
+        # self.model = self.model.eval()
+
+    def eval_subject(self, subject_name, test_df, dev_df=None, few_shot=False, cot=False, save_result_dir=None):
+        correct_num = 0
+        if save_result_dir:
+            if few_shot:
+                result = []
+            score = []
+        if few_shot:
+            history = self.generate_few_shot_prompt(subject_name, dev_df, cot=cot)
+        else:
+            history = []
+        answers = list(test_df['answer'])
+        for row_index, row in tqdm(test_df.iterrows(), total=len(test_df)):
+            question = self.format_example(row, include_answer=False, cot=cot)
+            if few_shot:
+                response, _ = self.model.chat(self.tokenizer, question, do_sample=False, history=history)
+                response = response.strip()
+                # For ChatGLM, we use answer extraction in answer-only mode too.
+                ans, direct_extract = self.extract_cot_answer(row, response)
+            else:  # zero-shot: pick the answer from the first-token distribution
+                ans = self.generate_dist(self.model, self.tokenizer, question, do_sample=False,
+                                         max_length=2048, history=history)
+            if ans == answers[row_index]:
+                correct_num += 1
+                correct = 1
+            else:
+                correct = 0
+            if save_result_dir:
+                if few_shot:
+                    result.append(response)
+                score.append(correct)
+        correct_ratio = 100 * correct_num / len(answers)
+
+        if save_result_dir:
+            if few_shot:
+                test_df['model_output'] = result
+            test_df['correctness'] = score
+            test_df.to_csv(os.path.join(save_result_dir, f'{subject_name}_test.csv'))
+
+        return correct_ratio
+
+    def generate_few_shot_prompt(self, subject, dev_df, cot=False):
+        message = []
+        k = self.k
+        if self.k == -1:
+            k = dev_df.shape[0]
+        message.append(self.format_example(dev_df.iloc[0, :], cot=cot,
+                                           add_prompt=f"以下是中国关于{subject}考试的单项选择题,请选出其中的正确答案。\n\n"))
+        for i in range(1, k):
+            message.append(self.format_example(dev_df.iloc[i, :], cot=cot))
+        return message
+
+    def format_example(self, line, include_answer=True, cot=False, add_prompt=''):
+        example = add_prompt + line['question']
+        # print(example)
+        for choice in self.choices:
+            example += f'\n{choice}. {line[f"{choice}"]}'
+        example += '\n答案:'
+        if include_answer:
+            if cot:
+                ans = "让我们一步一步思考,\n" + line["explanation"] + f"\n所以答案是{line['answer']}。"
+            else:
+                ans = line["answer"]
+            m = (example, ans)
+            return m
+        return example
+
+    def extract_cot_answer(self, line, gen_ans):
+        m = re.findall(r'所以答案是(.+?)。', gen_ans, re.M)
+        if len(m) > 0 and m[-1] in self.choices:
+            return m[-1], True
+        answer_patterns = [
+            r'([ABCD])是正确的',
+            r'选项([ABCD])正确',
+            r'答案为([ABCD])',
+            r'答案是([ABCD])',
+            r'答案([ABCD])',
+            r'选择([ABCD])',
+            r'答案:([ABCD])',
+            r'选择答案([ABCD])'
+        ]
+        # RE extraction
+        for answer_pattern in answer_patterns:
+            m = re.search(answer_pattern, gen_ans, re.M)
+            if m:
+                answer = m.group(1)
+                return answer, False
+        # only containing one choice-character
+        m = re.findall(r'[ABCD]', gen_ans, re.M)
+        if len(m) == 1:
+            answer = m[0]
+            return answer, False
+        answer_word_counter = 0
+        # only containing one choice-context
+        for c in self.choices:
+            if str(line[f'{c}']) in gen_ans:
+                answer = c
+                answer_word_counter += 1
+        if answer_word_counter == 1:
+            return answer, False
+        return '-', False
+
+    def generate_dist(self, model, tokenizer, query, history, num_beams=1, max_length=2048,
+                      do_sample=False, top_p=0.7, temperature=0.95, logits_processor=None, **kwargs):
+        if history is None:
+            history = []
+        if logits_processor is None:
+            logits_processor = LogitsProcessorList()
+        logits_processor.append(InvalidScoreLogitsProcessor())
+        gen_kwargs = {"num_beams": num_beams, "do_sample": do_sample, "top_p": top_p, "max_length": max_length,
+                      "temperature": temperature, "logits_processor": logits_processor, **kwargs}
+        if not history:
+            prompt = query
+        else:
+            prompt = ""
+            for i, (old_query, response) in enumerate(history):
+                prompt += "[Round {}]\n问:{}\n答:{}\n".format(i, old_query, response)
+            prompt += "[Round {}]\n问:{}\n答:".format(len(history), query)
+        inputs = tokenizer([prompt], return_tensors="pt")
+        inputs = inputs.to(model.device)
+        outputs = model.generate(**inputs, return_dict_in_generate=True, output_scores=True, **gen_kwargs)
+
+        # Rank the choices by the logits of the first generated token; the hardcoded indices
+        # are the vocabulary ids of the tokens "A", "B", "C" and "D" for this tokenizer.
+        score = outputs.scores[0][0].tolist()
+        choice_score = [score[167], score[333], score[251], score[416]]
+        ranked_index = [index for index, value in sorted(list(enumerate(choice_score)), key=lambda x: x[1], reverse=True)]
+        return self.choices[ranked_index[0]]
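The zero-shot path above scores only the first generated token, indexing outputs.scores with hardcoded vocabulary ids (167, 333, 251, 416) for "A" through "D". A minimal sanity check, assuming the same tokenizer is available locally, would be to confirm those ids before trusting zero-shot results:

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
    for letter in ["A", "B", "C", "D"]:
        # Expected to print the ids used in generate_dist; if not, update choice_score.
        print(letter, tokenizer.convert_tokens_to_ids(letter))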
diff --git a/evaluators/chatglm2.py b/evaluators/chatglm2.py
new file mode 100644
index 0000000..bbc668b
--- /dev/null
+++ b/evaluators/chatglm2.py
@@ -0,0 +1,155 @@
+import os
+import re
+
+import torch
+from tqdm import tqdm
+from transformers import AutoTokenizer, AutoModel, AutoConfig
+from transformers.generation.logits_process import LogitsProcessor
+from transformers.generation.utils import LogitsProcessorList
+from peft import PeftModel
+
+from evaluators.evaluator import Evaluator
+
+
+class InvalidScoreLogitsProcessor(LogitsProcessor):
+    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
+        # If the scores degenerate to NaN/inf, replace them with a safe one-hot distribution.
+        if torch.isnan(scores).any() or torch.isinf(scores).any():
+            scores.zero_()
+            scores[..., 5] = 5e4
+        return scores
+
+
+class ChatGLM_Evaluator(Evaluator):
+    def __init__(self, choices, k, model_name, device, finetune=None):
+        super(ChatGLM_Evaluator, self).__init__(choices, model_name, k)
+        # Try adding 'mirror="tuna"' and 'resume_download=True' if facing the 'read timed out'
+        # problem, or directly clone the model.
+        self.tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm2-6b", trust_remote_code=True, mirror="tuna")
+        self.model = AutoModel.from_pretrained("THUDM/chatglm2-6b", trust_remote_code=True, mirror="tuna",
+                                               resume_download=True).half().to(device)
+        if finetune:
+            # Wrap the base model with a LoRA adapter stored under lora/<finetune>.
+            peft_model_id = "lora/" + finetune
+            self.model = PeftModel.from_pretrained(self.model, peft_model_id)
+            print("Model loaded! (GLM2 + " + finetune + ")")
+        else:
+            print("Model loaded! (GLM2)")
+        # self.model = self.model.eval()
+
+    def eval_subject(self, subject_name, test_df, dev_df=None, few_shot=False, cot=False, save_result_dir=None):
+        correct_num = 0
+        if save_result_dir:
+            if few_shot:
+                result = []
+            score = []
+        if few_shot:
+            history = self.generate_few_shot_prompt(subject_name, dev_df, cot=cot)
+        else:
+            history = []
+        answers = list(test_df['answer'])
+        for row_index, row in tqdm(test_df.iterrows(), total=len(test_df)):
+            question = self.format_example(row, include_answer=False, cot=cot)
+            if few_shot:
+                response, _ = self.model.chat(self.tokenizer, question, do_sample=False, history=history)
+                response = response.strip()
+                # As with ChatGLM, we use answer extraction in answer-only mode too.
+                ans, direct_extract = self.extract_cot_answer(row, response)
+            else:  # zero-shot: pick the answer from the first-token distribution
+                ans = self.generate_dist(self.model, self.tokenizer, question, do_sample=False,
+                                         max_length=2048, history=history)
+            if ans == answers[row_index]:
+                correct_num += 1
+                correct = 1
+            else:
+                correct = 0
+            if save_result_dir:
+                if few_shot:
+                    result.append(response)
+                score.append(correct)
+        correct_ratio = 100 * correct_num / len(answers)
+
+        if save_result_dir:
+            if few_shot:
+                test_df['model_output'] = result
+            test_df['correctness'] = score
+            test_df.to_csv(os.path.join(save_result_dir, f'{subject_name}_{correct_ratio}_test.csv'))
+
+        return correct_ratio
+
+    def generate_few_shot_prompt(self, subject, dev_df, cot=False):
+        message = []
+        k = self.k
+        if self.k == -1:
+            k = dev_df.shape[0]
+        message.append(self.format_example(dev_df.iloc[0, :], cot=cot,
+                                           add_prompt=f"以下是中国关于{subject}考试的单项选择题,请选出其中的正确答案。\n\n"))
+        for i in range(1, k):
+            message.append(self.format_example(dev_df.iloc[i, :], cot=cot))
+        return message
+
+    def format_example(self, line, include_answer=True, cot=False, add_prompt=''):
+        example = add_prompt + line['question']
+        # print(example)
+        for choice in self.choices:
+            example += f'\n{choice}. {line[f"{choice}"]}'
+        example += '\n答案:'
+        if include_answer:
+            if cot:
+                ans = "让我们一步一步思考,\n" + line["explanation"] + f"\n所以答案是{line['answer']}。"
+            else:
+                ans = line["answer"]
+            m = (example, ans)
+            return m
+        return example
+
+    def extract_cot_answer(self, line, gen_ans):
+        m = re.findall(r'所以答案是(.+?)。', gen_ans, re.M)
+        if len(m) > 0 and m[-1] in self.choices:
+            return m[-1], True
+        answer_patterns = [
+            r'([ABCD])是正确的',
+            r'选项([ABCD])正确',
+            r'答案为([ABCD])',
+            r'答案是([ABCD])',
+            r'答案([ABCD])',
+            r'选择([ABCD])',
+            r'答案:([ABCD])',
+            r'选择答案([ABCD])'
+        ]
+        # RE extraction
+        for answer_pattern in answer_patterns:
+            m = re.search(answer_pattern, gen_ans, re.M)
+            if m:
+                answer = m.group(1)
+                return answer, False
+        # only containing one choice-character
+        m = re.findall(r'[ABCD]', gen_ans, re.M)
+        if len(m) == 1:
+            answer = m[0]
+            return answer, False
+        answer_word_counter = 0
+        # only containing one choice-context
+        for c in self.choices:
+            if str(line[f'{c}']) in gen_ans:
+                answer = c
+                answer_word_counter += 1
+        if answer_word_counter == 1:
+            return answer, False
+        return '-', False
+
+    def generate_dist(self, model, tokenizer, query, history, num_beams=1, max_length=2048,
+                      do_sample=False, top_p=0.7, temperature=0.95, logits_processor=None, **kwargs):
+        if history is None:
+            history = []
+        if logits_processor is None:
+            logits_processor = LogitsProcessorList()
+        logits_processor.append(InvalidScoreLogitsProcessor())
+        gen_kwargs = {"num_beams": num_beams, "do_sample": do_sample, "top_p": top_p, "max_length": max_length,
+                      "temperature": temperature, "logits_processor": logits_processor, **kwargs}
+        if not history:
+            prompt = query
+        else:
+            prompt = ""
+            for i, (old_query, response) in enumerate(history):
+                prompt += "[Round {}]\n问:{}\n答:{}\n".format(i, old_query, response)
+            prompt += "[Round {}]\n问:{}\n答:".format(len(history), query)
+        inputs = tokenizer([prompt], return_tensors="pt")
+        inputs = inputs.to(model.device)
+        outputs = model.generate(**inputs, return_dict_in_generate=True, output_scores=True, **gen_kwargs)
+
+        # Rank the choices by the logits of the first generated token.
+        # NOTE: the vocabulary ids below are inherited from the ChatGLM evaluator; verify that
+        # they map to "A"/"B"/"C"/"D" in the ChatGLM2 tokenizer before relying on zero-shot scores.
+        score = outputs.scores[0][0].tolist()
+        choice_score = [score[167], score[333], score[251], score[416]]
+        ranked_index = [index for index, value in sorted(list(enumerate(choice_score)), key=lambda x: x[1], reverse=True)]
+        return self.choices[ranked_index[0]]
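Because the adapter stays wrapped in a PeftModel, every forward pass goes through the LoRA hooks. If the installed peft version supports it, the adapter can instead be folded into the base weights once after loading; a sketch, assuming a hypothetical adapter directory lora/my-adapter and base_model being the loaded ChatGLM2 AutoModel:

    from peft import PeftModel

    model = PeftModel.from_pretrained(base_model, "lora/my-adapter")
    model = model.merge_and_unload()  # fold the LoRA deltas into the base weights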
diff --git a/evaluators/chatgpt.py b/evaluators/chatgpt.py
new file mode 100644
index 0000000..fd17d58
--- /dev/null
+++ b/evaluators/chatgpt.py
@@ -0,0 +1,169 @@
+import os
+import re
+from time import sleep
+
+import openai
+from tqdm import tqdm
+
+from evaluators.evaluator import Evaluator
+
+
+class ChatGPT_Evaluator(Evaluator):
+    # NOTE: this class uses the legacy openai<1.0 ChatCompletion interface; it will not run
+    # unmodified against the openai 1.x release pinned in the README (eval.py accordingly
+    # reports GPT models as "Not supported yet").
+    def __init__(self, choices, k, api_key, model_name):
+        super(ChatGPT_Evaluator, self).__init__(choices, model_name, k)
+        openai.api_key = api_key
+
+    def format_example(self, line, include_answer=True, cot=False):
+        example = line['question']
+        for choice in self.choices:
+            example += f'\n{choice}. {line[f"{choice}"]}'
+        example += '\n答案:'
+        if include_answer:
+            if cot:
+                ans = line["answer"]
+                content = "让我们一步一步思考,\n" + line["explanation"] + f"\n所以答案是{ans}。"
+                return [
+                    {"role": "user", "content": example},
+                    {"role": "assistant", "content": content}
+                ]
+            else:
+                return [
+                    {"role": "user", "content": example},
+                    {"role": "assistant", "content": line["answer"]}
+                ]
+        else:
+            return [
+                {"role": "user", "content": example},
+            ]
+
+    def generate_few_shot_prompt(self, subject, dev_df, cot=False):
+        prompt = [
+            {
+                "role": "system",
+                "content": f"你是一个中文人工智能助手,以下是中国关于{subject}考试的单项选择题,请选出其中的正确答案。"
+            }
+        ]
+        k = self.k
+        if self.k == -1:
+            k = dev_df.shape[0]
+        for i in range(k):
+            tmp = self.format_example(dev_df.iloc[i, :], include_answer=True, cot=cot)
+            if i == 0:
+                tmp[0]["content"] = f"以下是中国关于{subject}考试的单项选择题,请选出其中的正确答案。\n\n" + tmp[0]["content"]
+            prompt += tmp
+        return prompt
+
+    def eval_subject(self, subject_name, test_df, dev_df=None, few_shot=False, save_result_dir=None, cot=False):
+        correct_num = 0
+        if save_result_dir:
+            result = []
+            score = []
+        if few_shot:
+            few_shot_prompt = self.generate_few_shot_prompt(subject_name, dev_df, cot=cot)
+        else:
+            few_shot_prompt = [
+                {
+                    "role": "system",
+                    "content": f"你是一个中文人工智能助手,以下是中国关于{subject_name}考试的单项选择题,请选出其中的正确答案。"
+                }
+            ]
+        answers = list(test_df['answer'])
+        for row_index, row in tqdm(test_df.iterrows(), total=len(test_df)):
+            question = self.format_example(row, include_answer=False)
+            full_prompt = few_shot_prompt + question
+            if not few_shot:
+                full_prompt[-1]["content"] = f"以下是中国关于{subject_name}考试的单项选择题,请选出其中的正确答案。\n\n" + full_prompt[-1]["content"]
+            response = None
+            timeout_counter = 0
+            # Retry on errors; give up on a question after 30 timeouts.
+            while response is None and timeout_counter <= 30:
+                try:
+                    response = openai.ChatCompletion.create(
+                        model=self.model_name,
+                        messages=full_prompt,
+                        temperature=0.
+                    )
+                except Exception as msg:
+                    if "timeout=600" in str(msg):
+                        timeout_counter += 1
+                    print(msg)
+                    sleep(5)
+                    continue
+            if response is None:
+                response_str = ""
+            else:
+                response_str = response['choices'][0]['message']['content']
+            # print(response_str)
+            if cot:
+                ans_list = re.findall(r"答案是(.+?)。", response_str)
+                if len(ans_list) == 0:
+                    ans_list = re.findall(r"答案为(.+?)。", response_str)
+                if len(ans_list) == 0:
+                    ans_list = re.findall(r"选项(.+?)是正确的。", response_str)
+
+                if len(ans_list) == 0:
+                    correct = 0
+                else:
+                    if self.exact_match(ans_list[-1], row["answer"]):
+                        correct_num += 1
+                        correct = 1
+                    else:
+                        correct = 0
+            else:
+                response_str = response_str.strip()
+                if few_shot:
+                    if len(response_str) > 0:
+                        if self.exact_match(response_str, row["answer"]):
+                            correct_num += 1
+                            correct = 1
+                        else:
+                            correct = 0
+                    else:
+                        correct = 0
+                else:
+                    if len(response_str) > 0:
+                        ans_list = self.extract_ans(response_str)
+                        if len(ans_list) > 0 and (ans_list[-1] == row["answer"]):
+                            correct_num += 1
+                            correct = 1
+                        else:
+                            correct = 0
+                    else:
+                        correct = 0
+            if save_result_dir:
+                result.append(response_str)
+                score.append(correct)
+        correct_ratio = 100 * correct_num / len(answers)
+
+        if save_result_dir:
+            test_df['model_output'] = result
+            test_df["correctness"] = score
+            test_df.to_csv(os.path.join(save_result_dir, f'{subject_name}_val.csv'), encoding="utf-8", index=False)
+        return correct_ratio
+
+    def extract_ans(self, response_str):
+        # The colon patterns appear in both full-width (:) and half-width (:) variants.
+        pattern = [
+            r"^选([A-D])",
+            r"^选项([A-D])",
+            r"答案是\s?选?项?\s?([A-D])",
+            r"答案为\s?选?项?\s?([A-D])",
+            r"答案应为\s?选?项?\s?([A-D])",
+            r"答案选\s?选?项?\s?([A-D])",
+            r"答案是:\s?选?项?\s?([A-D])",
+            r"答案应该是:\s?选?项?\s?([A-D])",
+            r"正确的一项是\s?([A-D])",
+            r"答案为:\s?选?项?\s?([A-D])",
+            r"答案应为:\s?选?项?\s?([A-D])",
+            r"答案:\s?选?项?\s?([A-D])",
+            r"答案是:\s?选?项?\s?([A-D])",
+            r"答案应该是:\s?选?项?\s?([A-D])",
+            r"答案为:\s?选?项?\s?([A-D])",
+            r"答案应为:\s?选?项?\s?([A-D])",
+            r"答案:\s?选?项?\s?([A-D])",
+        ]
+        ans_list = []
+        if response_str[0] in ["A", 'B', 'C', 'D']:
+            ans_list.append(response_str[0])
+        for p in pattern:
+            if len(ans_list) == 0:
+                ans_list = re.findall(p, response_str)
+            else:
+                break
+        return ans_list
\ No newline at end of file
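As noted above, this evaluator targets the legacy pre-1.0 openai SDK. A sketch of the equivalent request against the 1.x client pinned in the README (variable names mirror eval_subject; api_key and model_name are assumed to be in scope):

    from openai import OpenAI

    client = OpenAI(api_key=api_key)
    response = client.chat.completions.create(
        model=model_name,
        messages=full_prompt,
        temperature=0.0,
    )
    response_str = response.choices[0].message.content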
diff --git a/evaluators/evaluator.py b/evaluators/evaluator.py
new file mode 100644
index 0000000..aadf1df
--- /dev/null
+++ b/evaluators/evaluator.py
@@ -0,0 +1,47 @@
+import re
+import string
+
+
+class Evaluator:
+    def __init__(self, choices, model_name, k=-1):
+        self.choices = choices
+        self.model_name = model_name
+        self.k = k
+        self.puncs = list(string.punctuation)
+
+    def format_example(self, line, include_answer=True):
+        example = line['question']
+        # print(example)
+        for choice in self.choices:
+            example += f'\n{choice}. {line[f"{choice}"]}'
+        example += '\n答案:'
+        if include_answer:
+            example += f'{line["answer"]}\n\n'
+        return example
+
+    def generate_few_shot_prompt(self, subject, dev_df):
+        prompt = f"以下是中国关于{subject}考试的单项选择题,请选出其中的正确答案。\n\n"
+        k = self.k
+        if self.k == -1:
+            k = dev_df.shape[0]
+        for i in range(k):
+            prompt += self.format_example(dev_df.iloc[i, :])
+        return prompt
+
+    def eval_subject(self, subject_name, test_df, dev_df=None, few_shot=False, save_result_dir=None):
+        pass
+
+    def normalize_answer(self, s):
+
+        def white_space_fix(text):
+            return ' '.join(text.split())
+
+        def remove_punc(text):
+            exclude = set(self.puncs)
+            return ''.join(ch for ch in text if ch not in exclude)
+
+        def lower(text):
+            return text.lower()
+
+        return white_space_fix(remove_punc(lower(s)))
+
+    def exact_match(self, pred, target):
+        return self.normalize_answer(pred) == self.normalize_answer(target)
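normalize_answer lowercases the string, strips ASCII punctuation (string.punctuation does not cover full-width Chinese punctuation), and collapses whitespace, so exact_match tolerates case and trailing periods. A small illustration:

    e = Evaluator(choices=["A", "B", "C", "D"], model_name="demo")
    print(e.exact_match("a.", "A"))  # True: case and punctuation are normalized away
    print(e.exact_match("AB", "A"))  # False: the content differs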
diff --git a/generate_eval_text.py b/generate_eval_text.py
new file mode 100644
index 0000000..91fe2bf
--- /dev/null
+++ b/generate_eval_text.py
@@ -0,0 +1,37 @@
+import pandas as pd
+
+# Read the CSV file
+df = pd.read_csv("data/val/car_use_val.csv")
+correct_num = 0
+total_num = 0
+
+# Iterate over the rows and render each multiple-choice question
+for index, row in df.iterrows():
+    question_text = row['question']
+    options = [row['A'], row['B'], row['C'], row['D']]
+    answer = row['answer']
+
+    # Build the question text
+    question_text = f"{question_text}\n"
+    for i, option in enumerate(options):
+        question_text += f"{chr(65 + i)}. {option}\n"
+
+    # Show the question and record the user's answer
+    print(f"问题 {index + 1}:")
+    print(question_text)
+    user_answer = input("请输入你的答案: ")
+    df.loc[index, 'user_answer'] = user_answer
+    print(f"答案: {answer}\n")
+    total_num += 1
+    if user_answer == answer:
+        print("回答正确!\n")
+        correct_num += 1
+    else:
+        print("回答错误!\n")
+
+# Compute the overall accuracy
+correct_ratio = 100 * correct_num / total_num
+print(f"正确率: {correct_ratio}%")
+# Save the results (including the user's answers) to a file
+df.to_csv("logs/car_use_val_gpt3.5_" + str(correct_ratio) + ".csv")
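All of the evaluators assume per-subject CSVs under data/dev and data/val with question, A-D, answer and, for chain-of-thought few-shot examples, explanation columns. A hypothetical row, for illustration only:

    question,A,B,C,D,answer,explanation
    发动机机油应多久检查一次?,每天,每周,每月,每年,B,多数厂商建议定期(例如每周)检查机油液位。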