diff --git a/README.md b/README.md
index 3a3686e..83714a4 100644
--- a/README.md
+++ b/README.md
@@ -9,7 +9,7 @@ A simple program to evaluate large language model.
 - transformers 4.33.2
 - accelerate 0.26.1
 - tqdm 4.66.1
-- openai 1.10.0
+- openai 0.28
 
 ## 需求其余文件
 
@@ -18,6 +18,7 @@ A simple program to evaluate large language model.
 - 微调后的lora模型可放置于`./lora`文件夹下,可应用于ChatGLM2
 - 微调后的ptuning模型可放置于`./ptuning`文件夹下,可应用于ChatGLM
 - 训练数据按照C-Eval格式,放置于`./data`文件夹下,文件命名和`eval.py`中的`subject_name`相关
+- 相较于C-Eval的数据集,代码添加了'qa'的数据集,放置于`./data/qa`文件夹下,为非选择题的问答数据集。
 
 ## Run
 
diff --git a/eval.py b/eval.py
index edf7a7d..9a8f2a8 100644
--- a/eval.py
+++ b/eval.py
@@ -15,7 +15,7 @@ def main(args):
     if "turbo" in args.model_name or "gpt-4" in args.model_name:
         # print("Not supported yet")
         # return -1
-        evaluator=ChatGPT_Evaluator(
+        evaluator = ChatGPT_Evaluator(
             choices=choices,
             k=args.ntrain,
             api_key=args.openai_key,
@@ -66,9 +66,10 @@ def main(args):
         os.mkdir(save_result_dir)
 
     subject_list = ['computer_architecture', 'car_knowledge', 'car_use', 'car_market']
+    qa_subject_list = ['car_knowledge', 'car_use', 'car_market']
 
     for subject_name in subject_list:
-        print(subject_name)
+        print("Now testing: " + subject_name)
         # subject_name=args.subject
         val_file_path = os.path.join('data/val', f'{subject_name}_val.csv')
         val_df = pd.read_csv(val_file_path)
@@ -82,6 +83,12 @@ def main(args):
                                              save_result_dir=save_result_dir)
         print("Acc:", correct_ratio)
 
+    for subject_name in qa_subject_list:
+        print("Now testing: " + subject_name)
+        qa_file_path = os.path.join('data/qa', f'{subject_name}_qa.csv')
+        qa_df = pd.read_csv(qa_file_path)
+        evaluator.eval_qa(subject_name, qa_df, save_result_dir=save_result_dir)
+
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
diff --git a/evaluators/chatglm.py b/evaluators/chatglm.py
index 5196b9a..b12c88c 100644
--- a/evaluators/chatglm.py
+++ b/evaluators/chatglm.py
@@ -7,6 +7,7 @@ from transformers.generation.logits_process import LogitsProcessor
 from transformers.generation.utils import LogitsProcessorList
 from evaluators.evaluator import Evaluator
 
+
 class InvalidScoreLogitsProcessor(LogitsProcessor):
     def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
         if torch.isnan(scores).any() or torch.isinf(scores).any():
@@ -14,6 +15,7 @@ class InvalidScoreLogitsProcessor(LogitsProcessor):
             scores[..., 5] = 5e4
         return scores
 
+
 class ChatGLM_Evaluator(Evaluator):
     def __init__(self, choices, k, model_name, device, finetune=None):
         super(ChatGLM_Evaluator, self).__init__(choices, model_name, k)
@@ -21,7 +23,7 @@ class ChatGLM_Evaluator(Evaluator):
         # or directly clone the model
         self.tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True, mirror="tuna")
         if finetune:
-            CHECKPOINT_PATH="ptuning/" + finetune
+            CHECKPOINT_PATH = "ptuning/" + finetune
             config = AutoConfig.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True, pre_seq_len=128)
             self.model = AutoModel.from_pretrained("THUDM/chatglm-6b", config=config, trust_remote_code=True)
             prefix_state_dict = torch.load(os.path.join(CHECKPOINT_PATH, "pytorch_model.bin"))
@@ -34,7 +36,8 @@ class ChatGLM_Evaluator(Evaluator):
             self.model.transformer.prefix_encoder.float()
             print("Model loaded! use GLM + " + finetune)
         else:
-            self.model = AutoModel.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True, mirror="tuna", resume_download=True).half().to(device)
+            self.model = AutoModel.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True, mirror="tuna",
+                                                   resume_download=True).half().to(device)
             print("Model loaded!(GLM)")
         # self.model = self.model.eval()
 
@@ -45,6 +48,7 @@ class ChatGLM_Evaluator(Evaluator):
         if few_shot:
             result = []
             score = []
+            answer_list = []
         if few_shot:
             history = self.generate_few_shot_prompt(subject_name, dev_df, cot=cot)
         else:
@@ -57,8 +61,9 @@ class ChatGLM_Evaluator(Evaluator):
                 response = response.strip()
                 # For ChatGLM, we use answer extraction in answer-only mode too.
                 ans, direct_extract = self.extract_cot_answer(row, response)
-            else:   # zero-shot by extracting answer from distribution
-                ans = self.generate_dist(self.model, self.tokenizer, question, do_sample=False, max_length=2048, history=history)
+            else:  # zero-shot by extracting answer from distribution
+                ans = self.generate_dist(self.model, self.tokenizer, question, do_sample=False, max_length=2048,
+                                         history=history)
             if ans == answers[row_index]:
                 correct_num += 1
                 correct = 1
@@ -68,12 +73,14 @@ class ChatGLM_Evaluator(Evaluator):
             if few_shot:
                 result.append(response)
                 score.append(correct)
-        correct_ratio = 100*correct_num/len(answers)
+                answer_list.append(ans)
+        correct_ratio = 100 * correct_num / len(answers)
 
         if save_result_dir:
             if few_shot:
                 test_df['model_output'] = result
                 test_df['correctness'] = score
+                test_df['model_answer'] = answer_list
             result_file_name = f'{subject_name}_{correct_ratio}_test.csv'
             if few_shot:
                 result_file_name = f'{subject_name}_{correct_ratio}_few_shot_test.csv'
@@ -81,12 +88,24 @@ class ChatGLM_Evaluator(Evaluator):
 
         return correct_ratio
 
+    def eval_qa(self, subject_name, qa_df, save_result_dir=None):
+        history = []
+        for row_index, row in tqdm(qa_df.iterrows(), total=len(qa_df)):
+            question = row['question']
+            response, _ = self.model.chat(self.tokenizer, question, do_sample=False, history=history)
+            response = response.strip()
+            qa_df.loc[row_index, 'model_output'] = response
+        if save_result_dir:
+            result_file_name = f'{subject_name}_qa_test_result.csv'
+            qa_df.to_csv(os.path.join(save_result_dir, result_file_name))
+
     def generate_few_shot_prompt(self, subject, dev_df, cot=False):
         message = []
         k = self.k
         if self.k == -1:
             k = dev_df.shape[0]
-        message.append(self.format_example(dev_df.iloc[0, :], cot=cot, add_prompt=f"以下是中国关于{subject}考试的单项选择题,请选出其中的正确答案。\n\n"))
+        message.append(self.format_example(dev_df.iloc[0, :], cot=cot,
+                                           add_prompt=f"以下是中国关于{subject}考试的单项选择题,请选出其中的正确答案。\n\n"))
         for i in range(1, k):
             message.append(self.format_example(dev_df.iloc[i, :], cot=cot))
         return message
@@ -164,5 +183,6 @@ class ChatGLM_Evaluator(Evaluator):
 
         score = outputs.scores[0][0].tolist()
         choice_score = [score[167], score[333], score[251], score[416]]
-        ranked_index = [index for index, value in sorted(list(enumerate(choice_score)), key=lambda x:x[1], reverse=True)]
+        ranked_index = [index for index, value in
+                        sorted(list(enumerate(choice_score)), key=lambda x: x[1], reverse=True)]
         return self.choices[ranked_index[0]]
diff --git a/evaluators/chatglm2.py b/evaluators/chatglm2.py
index 74c0a26..03864bd 100644
--- a/evaluators/chatglm2.py
+++ b/evaluators/chatglm2.py
@@ -36,6 +36,7 @@ class ChatGLM_Evaluator(Evaluator):
         if few_shot:
             result = []
             score = []
+            answer_list = []
         if few_shot:
             history = self.generate_few_shot_prompt(subject_name, dev_df, cot=cot)
         else:
@@ -58,6 +59,7 @@ class ChatGLM_Evaluator(Evaluator):
             if save_result_dir:
                 if few_shot:
                     result.append(response)
+                    answer_list.append(ans)
                     score.append(correct)
         correct_ratio = 100*correct_num/len(answers)
 
@@ -65,6 +67,7 @@ class ChatGLM_Evaluator(Evaluator):
             if few_shot:
                 test_df['model_output'] = result
                 test_df['correctness'] = score
+                test_df['model_answer'] = answer_list
             result_file_name = f'{subject_name}_{correct_ratio}_test.csv'
             if few_shot:
                 result_file_name = f'{subject_name}_{correct_ratio}_few_shot_test.csv'
diff --git a/evaluators/evaluator.py b/evaluators/evaluator.py
index aadf1df..46a6dc7 100644
--- a/evaluators/evaluator.py
+++ b/evaluators/evaluator.py
@@ -25,10 +25,13 @@ class Evaluator:
         for i in range(k):
             prompt += self.format_example(dev_df.iloc[i, :])
         return prompt
-    
+
     def eval_subject(self, subject_name, test_df, dev_df=None, few_shot=False, save_result_dir=None):
         pass
 
+    def eval_qa(self, subject_name, qa_df, save_result_dir=None):
+        pass
+
     def normalize_answer(self,s):
 
         def white_space_fix(text):
diff --git a/scoring/gpt_scoring.py b/scoring/gpt_scoring.py
new file mode 100644
index 0000000..305e9bf
--- /dev/null
+++ b/scoring/gpt_scoring.py
@@ -0,0 +1,77 @@
+import openai
+import re
+
+class ModelScorer:
+    def __init__(self, api_key):
+        openai.api_key = api_key
+        self.eval_mode = "accuracy"
+
+    def score_with_chatgpt(self, text):
+        try:
+            # Submit the prompt messages to ChatGPT for scoring
+            response = openai.ChatCompletion.create(
+                model="gpt-3.5-turbo",
+                messages=text,
+            )
+            # Extract the numeric score from the reply
+            chatgpt_response = response.choices[0]['message']['content']
+            chatgpt_score = self.extract_score(chatgpt_response)
+            return chatgpt_response, chatgpt_score
+        except Exception as e:
+            print("An error occurred while scoring with ChatGPT:", e)
+            # Always return a (response, score) pair so callers can unpack two values
+            return None, None
+
+    def generate_scoring_prompt(self, question, model_result, reference):
+        # Build the scoring prompt
+        base_prompt = []
+        if self.eval_mode == "accuracy":
+            base_prompt = [{
+                "role": "system",
+                "content": "你是一个汽车领域专家,接下来将向你提供一个问题、一个参考答案和一个大模型生成的结果。"
+                           "请对比参考答案和大模型生成结果,从信息准确性的角度评分以下生成的结果,以评估其质量。满分为5分。"
+                           "评分标准为:信息准确无误——5分。信息大致符合实际信息——4分。"
+                           "信息不全面但明确表达了自身无法回答——3分。信息完全错误——2分。回答无关——1分。"
+                           "可以根据实际情况稍作调整。"
+                           "回复格式为:评分为x分。理由:xxx。"
+            }]
+        prompt = base_prompt + [
+            {
+                "role": "user",
+                "content": f"问题:{question}\n\n生成的结果:{model_result}\n\n参考答案:{reference}"
+            }
+        ]
+        return prompt
+
+    def extract_score(self, response_text):
+        # Extract the score from a reply of the form "评分为x分。理由:xxx。"
+        pattern = [
+            r"^评分为([1-5])分",
+        ]
+        score_list = []
+        for p in pattern:
+            if len(score_list) == 0:
+                score_list = re.findall(p, response_text)
+            else:
+                break
+        # Return None instead of raising IndexError when no score was matched
+        return score_list[0] if score_list else None
+
+
+# Example usage
+if __name__ == "__main__":
+    # Do not hard-code a real key; supply your own key here
+    my_api_key = "YOUR_OPENAI_API_KEY"
+
+    # Initialize the model scorer
+    scorer = ModelScorer(my_api_key)
+
+    # The model output to be scored, with its question and reference answer
+    question = "秦Plus-DMi车型的安全气囊有哪些类型?"
+    model_result = "截止到我最后更新知识的时候,关于秦Plus-DMi车型的具体安全气囊类型的信息我并没有。通常来说,汽车的安全气囊系统可能包括驾驶员气囊、副驾驶气囊、侧面气囊、头部气囊等。但具体车型的安全气囊配置可能会因地区、年份和车型的不同而有所差异。建议您直接查询该车型的官方资料或者联系经销商以获取最准确的信息。"
+    reference = "秦Plus-DMi配备有驾驶员安全气囊、前排乘员安全气囊、侧帘式安全气囊和座椅侧安全气囊。"
+
+    prompt = scorer.generate_scoring_prompt(question, model_result, reference)
+    # Get the ChatGPT score
+    response, score = scorer.score_with_chatgpt(prompt)
+    if response is not None:
+        print("ChatGPT评分:", score, "\nChatGPT回复:", response)
+    else:
+        print("无法获取ChatGPT评分。")
diff --git a/scoring/rogue.py b/scoring/rogue.py
index e3e2c61..0655323 100644
--- a/scoring/rogue.py
+++ b/scoring/rogue.py
@@ -14,4 +14,4 @@ def get_rouge_score(s1, s2):
 
 if __name__ == "__main__":
     print('hello')
-    print(get_rouge_score("I love you", "I like you"))
+    print(get_rouge_score("比亚迪秦PLUS-DMi是一款混合动力汽车。", "比亚迪秦PLUS-DMi是一款混动车。"))
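Usage note: the patch itself does not show how the `eval_qa` output and the two scorers in `scoring/` are meant to be chained. The script below is a minimal sketch of one possible wiring, not part of the patch; it assumes the QA result CSV written by `eval_qa` also carries a reference answer in an `answer` column, that results were saved under a `logs/` directory (a stand-in for whatever `save_result_dir` was used), and that it is run from the repository root so `scoring.gpt_scoring` and `scoring.rogue` are importable.

# score_qa_results.py -- hypothetical glue script, not included in this patch
import os

import pandas as pd

from scoring.gpt_scoring import ModelScorer
from scoring.rogue import get_rouge_score


def score_qa_results(result_csv, api_key):
    scorer = ModelScorer(api_key)
    qa_df = pd.read_csv(result_csv)
    for _, row in qa_df.iterrows():
        # Lexical overlap between the model output and the reference answer
        rouge = get_rouge_score(row['model_output'], row['answer'])
        # GPT-judged 1-5 accuracy score, using the prompt defined in gpt_scoring.py
        prompt = scorer.generate_scoring_prompt(row['question'], row['model_output'], row['answer'])
        _, gpt_score = scorer.score_with_chatgpt(prompt)
        print(row['question'], "| ROUGE:", rouge, "| GPT score:", gpt_score)


if __name__ == "__main__":
    score_qa_results(os.path.join("logs", "car_use_qa_test_result.csv"),
                     api_key=os.environ.get("OPENAI_API_KEY", ""))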