diff --git a/.gitignore b/.gitignore
index ff4bd03..b157785 100644
--- a/.gitignore
+++ b/.gitignore
@@ -167,3 +167,5 @@ cython_debug/
 /ptuning/
 /logs/
 /data/
+/qlora/
+/metrics/
diff --git a/compare.py b/compare.py
new file mode 100644
index 0000000..cc6d540
--- /dev/null
+++ b/compare.py
@@ -0,0 +1,35 @@
+import pandas as pd
+
+# 读取两个csv文件
+human_score_df = pd.read_csv('logs/other/human.csv')
+machine_score_df = pd.read_csv('logs/other/result_diff_test_score_53.84256314043283.csv')
+
+result_df = pd.DataFrame(columns=['question', 'answer', 'predict_finetune', 'predict_origin', 'acc_finetune', 'human_acc_finetune', 'acc_origin', 'human_acc_origin', 'fluency_finetune', 'human_fluency_finetune', 'diff_score', 'human_diff_score'])
+result_df_row_index = 0
+for row_index, row in machine_score_df.iterrows():
+    acc_finetune_diff = row['acc_finetune'] - human_score_df.loc[row_index, '准确度(微调后']
+    acc_origin_diff = row['acc_origin'] - human_score_df.loc[row_index, '准确度(微调前']
+    fluency_finetune_diff = row['fluency_finetune'] - human_score_df.loc[row_index, '流畅度(微调后']
+    diff_score_diff = row['diff_score'] - human_score_df.loc[row_index, '是否超过原模型']
+    print("准确度(微调后)差值:", abs(acc_finetune_diff), end=' ')
+    print("准确度(微调前)差值:", abs(acc_origin_diff), end=' ')
+    print("流畅度(微调后)差值:", abs(fluency_finetune_diff), end=' ')
+    print("是否超过原模型差值:", abs(diff_score_diff))
+    if abs(acc_finetune_diff) >= 2:
+        result_df.loc[result_df_row_index, 'question'] = machine_score_df.loc[row_index, 'question']
+        result_df.loc[result_df_row_index, 'answer'] = machine_score_df.loc[row_index, 'answer']
+        result_df.loc[result_df_row_index, 'predict_finetune'] = machine_score_df.loc[row_index, 'predict_finetune']
+        result_df.loc[result_df_row_index, 'predict_origin'] = machine_score_df.loc[row_index, 'predict_origin']
+        result_df.loc[result_df_row_index, 'acc_finetune'] = machine_score_df.loc[row_index, 'acc_finetune']
+        result_df.loc[result_df_row_index, 'human_acc_finetune'] = human_score_df.loc[row_index, '准确度(微调后']
+        result_df.loc[result_df_row_index, 'acc_origin'] = machine_score_df.loc[row_index, 'acc_origin']
+        result_df.loc[result_df_row_index, 'human_acc_origin'] = human_score_df.loc[row_index, '准确度(微调前']
+        result_df.loc[result_df_row_index, 'fluency_finetune'] = machine_score_df.loc[row_index, 'fluency_finetune']
+        result_df.loc[result_df_row_index, 'human_fluency_finetune'] = human_score_df.loc[row_index, '流畅度(微调后']
+        result_df.loc[result_df_row_index, 'diff_score'] = machine_score_df.loc[row_index, 'diff_score']
+        result_df.loc[result_df_row_index, 'human_diff_score'] = human_score_df.loc[row_index, '是否超过原模型']
+        result_df_row_index += 1
+
+
+result_df.to_csv('logs/other/diff.csv', index=False)
+# 信息的准确性应当被首要考虑,多余的未知真假的信息不应该带来加分。
diff --git a/eval.py b/eval.py
index e0328f2..87f15e1 100644
--- a/eval.py
+++ b/eval.py
@@ -5,26 +5,40 @@
 import torch
 from evaluators.chatgpt import ChatGPT_Evaluator
 from evaluators.chatglm import ChatGLM_Evaluator
 from evaluators.chatglm2 import ChatGLM_Evaluator as ChatGLM2_Evaluator
+from evaluators.chatglm3 import ChatGLM_Evaluator as ChatGLM3_Evaluator
 import time
 
 choices = ["A", "B", "C", "D"]
+device = torch.device("cpu")
 
 
 def main(args):
+    global device
+    if args.cuda_device:
+        os.environ["CUDA_VISIBLE_DEVICES"] = args.cuda_device
+        device = torch.device("cuda")
     if "turbo" in args.model_name or "gpt-4" in args.model_name:
-        # print("Not supported yet")
-        # return -1
         evaluator = ChatGPT_Evaluator(
             choices=choices,
             k=args.ntrain,
             api_key=args.openai_key,
             model_name=args.model_name
         )
+    elif "chatglm3" in args.model_name:
"chatglm3" in args.model_name: + if args.finetune: + fine_tune_model = args.finetune + else: + fine_tune_model = None + evaluator = ChatGLM3_Evaluator( + choices=choices, + k=args.ntrain, + model_name=args.model_name, + device=device, + finetune=fine_tune_model, + finetune_method=args.finetune_method + ) elif "chatglm2" in args.model_name: - if args.cuda_device: - os.environ["CUDA_VISIBLE_DEVICES"] = args.cuda_device - device = torch.device("cuda") if args.finetune: fine_tune_model = args.finetune else: @@ -38,9 +52,6 @@ def main(args): finetune_method=args.finetune_method ) elif "chatglm" in args.model_name: - if args.cuda_device: - os.environ["CUDA_VISIBLE_DEVICES"] = args.cuda_device - device = torch.device("cuda") if args.finetune: fine_tune_model = args.finetune else: @@ -50,7 +61,8 @@ def main(args): k=args.ntrain, model_name=args.model_name, device=device, - finetune=fine_tune_model + finetune=fine_tune_model, + finetune_method=args.finetune_method ) else: print("Unknown model name") @@ -63,33 +75,34 @@ def main(args): fine_tune_model_name = args.finetune else: fine_tune_model_name = 'original' - save_result_dir = os.path.join(r"logs", f"{args.model_name}_{fine_tune_model_name}_{run_date}") + save_result_dir = os.path.join(r"logs", f"{args.model_name}_{fine_tune_model_name}/{run_date}") os.mkdir(save_result_dir) - subject_list = ['computer_architecture', 'car_knowledge', 'car_use', 'car_market'] + # subject_list = ['computer_architecture', 'car_knowledge', 'car_use', 'car_market'] + subject_list = ['car_knowledge_in_train', 'car_use_in_train', 'car_market_in_train'] qa_subject_list = ['car_knowledge', 'car_use', 'car_market'] - # qa_subject_list = ['car_use', 'car_market'== + # qa_subject_list = ['car_use', 'car_market'] - # for subject_name in subject_list: - # print("Now testing: " + subject_name) - # # subject_name=args.subject - # val_file_path = os.path.join('data/val', f'{subject_name}_val.csv') - # val_df = pd.read_csv(val_file_path) - # if args.few_shot: - # dev_file_path = os.path.join('data/dev', f'{subject_name}_dev.csv') - # dev_df = pd.read_csv(dev_file_path) - # correct_ratio = evaluator.eval_subject(subject_name, val_df, dev_df, few_shot=args.few_shot, - # save_result_dir=save_result_dir, cot=args.cot) - # else: - # correct_ratio = evaluator.eval_subject(subject_name, val_df, few_shot=args.few_shot, - # save_result_dir=save_result_dir) - # print("Acc:", correct_ratio) - - for subject_name in qa_subject_list: + for subject_name in subject_list: print("Now testing: " + subject_name) - qa_file_path = os.path.join('data/qa', f'{subject_name}_qa.csv') - qa_df = pd.read_csv(qa_file_path) - evaluator.eval_qa(subject_name, qa_df, save_result_dir=save_result_dir) + # subject_name=args.subject + val_file_path = os.path.join('data/val', f'{subject_name}_val.csv') + val_df = pd.read_csv(val_file_path) + if args.few_shot: + dev_file_path = os.path.join('data/dev', f'{subject_name}_dev.csv') + dev_df = pd.read_csv(dev_file_path) + correct_ratio = evaluator.eval_subject(subject_name, val_df, dev_df, few_shot=args.few_shot, + save_result_dir=save_result_dir, cot=args.cot) + else: + correct_ratio = evaluator.eval_subject(subject_name, val_df, few_shot=args.few_shot, + save_result_dir=save_result_dir) + print("Acc:", correct_ratio) + + # for subject_name in qa_subject_list: + # print("Now testing: " + subject_name) + # qa_file_path = os.path.join('data/qa', f'{subject_name}_qa.csv') + # qa_df = pd.read_csv(qa_file_path) + # evaluator.eval_qa(subject_name, qa_df, 
 
 
 if __name__ == "__main__":
diff --git a/evaluators/chatglm.py b/evaluators/chatglm.py
index b12c88c..2b8ba2e 100644
--- a/evaluators/chatglm.py
+++ b/evaluators/chatglm.py
@@ -17,13 +17,13 @@ class InvalidScoreLogitsProcessor(LogitsProcessor):
 
 
 class ChatGLM_Evaluator(Evaluator):
-    def __init__(self, choices, k, model_name, device, finetune=None):
+    def __init__(self, choices, k, model_name, device, finetune=None, finetune_method=None):
         super(ChatGLM_Evaluator, self).__init__(choices, model_name, k)
         # try adding 'mirror="tuna"' and 'resume_download=True' if facing the 'read timed out' problem
         # or directly clone the model
         self.tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True, mirror="tuna")
-        if finetune:
-            CHECKPOINT_PATH = "ptuning/" + finetune
+        if finetune_method == "ptuning":
+            CHECKPOINT_PATH = "ptuning/glm1/" + finetune
             config = AutoConfig.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True, pre_seq_len=128)
             self.model = AutoModel.from_pretrained("THUDM/chatglm-6b", config=config, trust_remote_code=True)
             prefix_state_dict = torch.load(os.path.join(CHECKPOINT_PATH, "pytorch_model.bin"))
@@ -38,17 +38,15 @@ class ChatGLM_Evaluator(Evaluator):
         else:
             self.model = AutoModel.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True, mirror="tuna",
                                                    resume_download=True).half().to(device)
-            print("Model loaded!(GLM)")
+            print("Model loaded! (GLM original)")
         # self.model = self.model.eval()
 
     def eval_subject(self, subject_name, test_df, dev_df=None, few_shot=False, cot=False, save_result_dir=None):
         correct_num = 0
-        if save_result_dir:
-            if few_shot:
-                result = []
-                score = []
-        answer_list = []
+        result = []
+        score = []
+        answer_list = []
         if few_shot:
             history = self.generate_few_shot_prompt(subject_name, dev_df, cot=cot)
         else:
@@ -62,8 +60,13 @@ class ChatGLM_Evaluator(Evaluator):
                 # For ChatGLM, we use answer extraction in answer-only mode too.
                 ans, direct_extract = self.extract_cot_answer(row, response)
             else:  # zero-shot by extracting answer from distribution
-                ans = self.generate_dist(self.model, self.tokenizer, question, do_sample=False, max_length=2048,
-                                         history=history)
+                response, _ = self.model.chat(self.tokenizer, question, max_length=300,
+                                              do_sample=False, history=history)
+                response = response.strip()
+                ans, direct_extract = self.extract_cot_answer(row, response)
+                # print(response, ans)
+                # ans = self.generate_dist(self.model, self.tokenizer, question, do_sample=False, max_length=2048,
+                #                          history=history)
             if ans == answers[row_index]:
                 correct_num += 1
                 correct = 1
diff --git a/evaluators/chatglm2.py b/evaluators/chatglm2.py
index 783a046..315d936 100644
--- a/evaluators/chatglm2.py
+++ b/evaluators/chatglm2.py
@@ -26,11 +26,11 @@ class ChatGLM_Evaluator(Evaluator):
         if finetune_method == "lora":
             self.model = AutoModel.from_pretrained("THUDM/chatglm2-6b", trust_remote_code=True, mirror="tuna",
                                                    resume_download=True).half().to(device)
-            peft_model_id = "lora/" + finetune
+            peft_model_id = "lora/glm2/" + finetune
             self.model = PeftModel.from_pretrained(self.model, peft_model_id)
             print("Model loaded! use GLM2" + finetune)
use GLM2" + finetune) elif finetune_method == "ptuning": - CHECKPOINT_PATH = "ptuning/" + finetune + CHECKPOINT_PATH = "ptuning/glm2/" + finetune config = AutoConfig.from_pretrained("THUDM/chatglm2-6b", trust_remote_code=True, pre_seq_len=128) self.model = AutoModel.from_pretrained("THUDM/chatglm2-6b", config=config, trust_remote_code=True) prefix_state_dict = torch.load(os.path.join(CHECKPOINT_PATH, "pytorch_model.bin")) @@ -55,7 +55,7 @@ class ChatGLM_Evaluator(Evaluator): answer_list = [] if few_shot: history = self.generate_few_shot_prompt(subject_name, dev_df, cot=cot) - print(history) + # print(history) else: # _ , history = self.model.chat(self.tokenizer, "接下来会提供给你一些选择题,请选出正确的答案。", do_sample=False) history = [('接下来会提供给你一些选择题,请选出正确的答案,给出正确的选项即可。', '好的,我会尽力解答。')] diff --git a/scoring/assessment_engine.py b/scoring/assessment_engine.py index 5c16612..4a845e1 100644 --- a/scoring/assessment_engine.py +++ b/scoring/assessment_engine.py @@ -25,7 +25,7 @@ class AssessmentEngine: rouge_score_sum += rouge_1_f_score qa_result_df.loc[row_index, 'rouge_score'] = rouge_1_f_score self.gpt_scorer.mode("accuracy") - gpt_score_acc, gpt_response_acc = self.gpt_scorer.score_with_chatgpt(test_question, + gpt_response_acc, gpt_score_acc = self.gpt_scorer.score_with_chatgpt(test_question, model_response, reference_answer) qa_result_df.loc[row_index, 'gpt_score_acc'] = gpt_score_acc qa_result_df.loc[row_index, 'gpt_response_acc'] = gpt_response_acc @@ -35,3 +35,115 @@ class AssessmentEngine: synthesis_score = rouge_score_sum / row_count qa_result_df.to_csv('logs/' + self.save_result_dir + '/' + subject_name + '_qa_test_score_' + str(synthesis_score) + '.csv', index=False) + + def eval_result_diff(self, csv_file_name): + result_diff_df = pd.read_csv('logs/' + self.save_result_dir + '/' + csv_file_name) + result_diff_df['rouge_score_finetune'] = 0 + result_diff_df['rouge_score_origin'] = 0 + result_diff_df['acc_finetune'] = 0 + result_diff_df['acc_origin'] = 0 + result_diff_df['fluency_finetune'] = 0 + result_diff_df['fluency_origin'] = 0 + result_diff_df['diff_score'] = 0 + result_diff_df['acc_response_finetune'] = 0 + result_diff_df['acc_response_origin'] = 0 + result_diff_df['fluency_response_finetune'] = 0 + result_diff_df['fluency_response_origin'] = 0 + result_diff_df['diff_score_response'] = 0 + start_time = time.time() + finetune_rouge_score_sum = 0 + origin_rouge_score_sum = 0 + finetune_acc_score_sum = 0 + origin_acc_score_sum = 0 + finetune_fluency_score_sum = 0 + origin_fluency_score_sum = 0 + model_better_score_sum = 0 + row_count = 0 + for row_index, row in tqdm(result_diff_df.iterrows(), total=len(result_diff_df)): + if row['question'] == '': + continue + row_count += 1 + test_question = row['question'] + finetune_model_response = row['predict_finetune'] + original_model_response = row['predict_origin'] + reference_answer = row['answer'] + # 计算ROUGE分数 + finetune_rouge_score = get_rouge_score(finetune_model_response, reference_answer) + finetune_rouge_1_f_score = finetune_rouge_score['rouge-1']['f'] + finetune_rouge_score_sum += finetune_rouge_1_f_score + result_diff_df.loc[row_index, 'rouge_score_finetune'] = finetune_rouge_1_f_score + origin_rouge_score = get_rouge_score(original_model_response, reference_answer) + origin_rouge_1_f_score = origin_rouge_score['rouge-1']['f'] + origin_rouge_score_sum += origin_rouge_1_f_score + result_diff_df.loc[row_index, 'rouge_score_origin'] = origin_rouge_1_f_score + + self.gpt_scorer.mode("accuracy") + gpt_response_acc, gpt_score_acc = 
+                                                                                  finetune_model_response,
+                                                                                  reference_answer))
+            result_diff_df.loc[row_index, 'acc_finetune'] = gpt_score_acc
+            result_diff_df.loc[row_index, 'acc_response_finetune'] = gpt_response_acc
+            if (gpt_score_acc is not None) and gpt_score_acc.isdigit():
+                finetune_acc_score_sum += float(gpt_score_acc)
+            gpt_response_acc, gpt_score_acc = (self.gpt_scorer.score_with_chatgpt(test_question,
+                                                                                  original_model_response,
+                                                                                  reference_answer))
+            result_diff_df.loc[row_index, 'acc_origin'] = gpt_score_acc
+            result_diff_df.loc[row_index, 'acc_response_origin'] = gpt_response_acc
+            if (gpt_score_acc is not None) and gpt_score_acc.isdigit():
+                origin_acc_score_sum += float(gpt_score_acc)
+
+            self.gpt_scorer.mode("fluency")
+            gpt_response_fluency, gpt_score_fluency = (self.gpt_scorer.score_with_chatgpt(test_question,
+                                                                                          finetune_model_response,
+                                                                                          reference_answer))
+            result_diff_df.loc[row_index, 'fluency_finetune'] = gpt_score_fluency
+            result_diff_df.loc[row_index, 'fluency_response_finetune'] = gpt_response_fluency
+            if (gpt_score_fluency is not None) and gpt_score_fluency.isdigit():
+                finetune_fluency_score_sum += float(gpt_score_fluency)
+            gpt_response_fluency, gpt_score_fluency = (self.gpt_scorer.score_with_chatgpt(test_question,
+                                                                                          original_model_response,
+                                                                                          reference_answer))
+            result_diff_df.loc[row_index, 'fluency_origin'] = gpt_score_fluency
+            result_diff_df.loc[row_index, 'fluency_response_origin'] = gpt_response_fluency
+            if (gpt_score_fluency is not None) and gpt_score_fluency.isdigit():
+                origin_fluency_score_sum += float(gpt_score_fluency)
+
+            self.gpt_scorer.mode("diff")
+            gpt_response_diff, gpt_score_diff = (self.gpt_scorer.score_with_chatgpt(test_question,
+                                                                                    finetune_model_response,
+                                                                                    reference_answer,
+                                                                                    original_model_response))
+            result_diff_df.loc[row_index, 'diff_score'] = gpt_score_diff
+            result_diff_df.loc[row_index, 'diff_score_response'] = gpt_response_diff
+            if (gpt_score_diff is not None) and gpt_score_diff.isdigit():
+                model_better_score_sum += float(gpt_score_diff)
+            result_diff_df.to_csv('logs/' + self.save_result_dir + '/result_diff_test_score_tmp.csv', index=False)
+
+        end_time = time.time()
+        elapsed_time = end_time - start_time
+        print("共评估结果" + str(row_count) + "条,总共用时:", elapsed_time, "秒")
+        synthesis_rouge_score = finetune_rouge_score_sum / row_count
+        original_rouge_score = origin_rouge_score_sum / row_count
+        synthesis_acc_score = finetune_acc_score_sum / row_count
+        original_acc_score = origin_acc_score_sum / row_count
+        synthesis_fluency_score = finetune_fluency_score_sum / row_count
+        original_fluency_score = origin_fluency_score_sum / row_count
+        synthesis_diff_score = model_better_score_sum / row_count
+        print("微调模型ROUGE分数:", synthesis_rouge_score)
+        print("原模型ROUGE分数:", original_rouge_score)
+        print("微调模型准确性分数:", synthesis_acc_score)
+        print("原模型准确性分数:", original_acc_score)
+        print("微调模型流畅度分数:", synthesis_fluency_score)
+        print("原模型流畅度分数:", original_fluency_score)
+        print("微调模型优于原模型分数:", synthesis_diff_score)
+        synthesis_score = (synthesis_rouge_score * 100 + synthesis_acc_score * 100 / 4 +
+                           synthesis_fluency_score * 100 / 3 + synthesis_diff_score * 100 / 3) / 4
+        print("综合评分:", synthesis_score)
+        original_synthesis_score = (original_rouge_score * 100 + original_acc_score * 100 / 4 +
+                                    original_fluency_score * 100 / 3 + 66) / 4
+        print("原模型综合评分:", original_synthesis_score)
+        # 获取当前时间的字符串
+        current_time = time.strftime("%Y%m%d%H%M%S", time.localtime(time.time()))
+        result_diff_df.to_csv('logs/' + self.save_result_dir + '/' + current_time + '_result_diff_test_score_' +
+                              str(synthesis_score) + '.csv', index=False)
diff --git a/scoring/gpt_scorer.py b/scoring/gpt_scorer.py
index 4e29fe3..703cede 100644
--- a/scoring/gpt_scorer.py
+++ b/scoring/gpt_scorer.py
@@ -8,11 +8,14 @@ class GPTScorer:
         self.eval_mode = "accuracy"
 
     def mode(self, mode):
+        # 判断模式是否合法
+        if mode not in ["accuracy", "fluency", "diff"]:
+            raise ValueError("Invalid mode. Must be one of 'accuracy', 'fluency' or 'diff'.")
         self.eval_mode = mode
         return self
 
-    def score_with_chatgpt(self, question, model_result, reference):
-        prompt = self.generate_scoring_prompt(question, model_result, reference)
+    def score_with_chatgpt(self, question, model_result, reference, origin_model_result=None):
+        prompt = self.generate_scoring_prompt(question, model_result, reference, origin_model_result)
         try:
             # 提交文本以获取ChatGPT评分
             response = openai.ChatCompletion.create(
@@ -25,33 +28,75 @@ class GPTScorer:
             return chatgpt_response, chatgpt_score
         except Exception as e:
             print("An error occurred while scoring with ChatGPT:", e)
-            return None, None
+            return None, '2'
 
-    def generate_scoring_prompt(self, question, model_result, reference):
+    def generate_scoring_prompt(self, question, model_result, reference, origin_model_result=None):
         # 生成评分提示
         base_prompt = []
         if self.eval_mode == "accuracy":
+            # base_prompt = [{
+            #     "role": "system",
+            #     "content": "你是一个汽车领域专家,接下来将向你提供一个问题、一个参考答案和一个大模型生成的结果。"
+            #                "请对比参考答案和大模型生成结果,从信息准确性的角度评分以下生成的结果,以评估其质量。满分为5分。"
+            #                "评分标准为:信息准确无误——5分。信息大致符合实际信息——4分。"
+            #                "信息不全面但明确表达了自身无法回答——3分。信息完全错误——2分。回答无关或回答语句不完整——1分。"
+            #                "可以根据实际情况稍作调整。"
+            #                "回复格式为:评分为x分。理由:xxx。"
+            # }]
             base_prompt = [{
                 "role": "system",
                 "content": "你是一个汽车领域专家,接下来将向你提供一个问题、一个参考答案和一个大模型生成的结果。"
-                           "请对比参考答案和大模型生成结果,从信息准确性的角度评分以下生成的结果,以评估其质量。满分为5分。"
-                           "评分标准为:信息准确无误——5分。信息大致符合实际信息——4分。"
-                           "信息不全面但明确表达了自身无法回答——3分。信息完全错误——2分。回答无关——1分。"
-                           "可以根据实际情况稍作调整。"
-                           "回复格式为:评分为x分。理由:xxx。"
+                           "请对比参考答案和大模型生成结果,从信息准确性的角度评分以下生成的结果,以评估其质量。满分为4分。"
+                           "信息的准确性应当被首要考虑,多余的未知真假的信息不应该带来加分。"
+                           "评分标准为:模型回答正确——4分。模型回答模糊,但部分准确——3分。"
+                           "模型无法给出解答,但明确表示无法解答——2分。模型给出错误或无法理解的回答/模型回答语句不完整——1分。"
+                           "回复格式为:理由:xxx。因此,评分为x分。"
             }]
-        prompt = base_prompt + [
-            {
-                "role": "user",
-                "content": f"问题:{question}\n\n生成的结果:{model_result}\n\n参考答案:{reference}"
-            }
-        ]
+        elif self.eval_mode == "fluency":
+            base_prompt = [{
+                "role": "system",
+                "content": "你是一个汽车领域专家,接下来将向你提供一个问题、一个参考答案和一个大模型生成的结果。"
+                           "请从语言流畅度的角度评分大模型生成的结果,以评估其质量。满分为3分。"
+                           "评分标准为:模型回答流畅,符合日常语言习惯——3分。模型回答流畅,但存在突然中断等情况——2分。"
+                           "模型回答无条理,可能重复输出某些单词——1分。"
+                           "回复格式为:理由:xxx。因此,评分为x分。"
+            }]
+        elif self.eval_mode == "diff":
+            base_prompt = [{
+                "role": "system",
+                "content": "你是一个汽车领域专家,接下来将向你提供一个问题、一个参考答案、一个大模型生成的结果和一个微调后大模型生成结果。"
+                           "请对比这些结果,判断微调后大模型的结果是否优于原模型。满分为3分。"
+                           "信息的准确性应当被首要考虑,多余的未知真假的信息不应该带来加分。"
+                           "对比时请关注结果和参考答案的契合度。"
+                           "评分标准为:认为回答优于原模型——3分。认为回答与原模型持平——2分。"
+                           "认为回答不如原模型——1分。"
+                           "回复格式为:理由:xxx。因此,评分为x分。"
+            }]
+        if self.eval_mode == "diff":
+            if origin_model_result is None:
+                raise ValueError("The original model result is required in 'diff' mode.")
+            prompt = base_prompt + [
+                {
+                    "role": "user",
+                    "content": f"问题:{question}\n\n原模型生成的结果:{origin_model_result}\n\n"
+                               f"微调后模型生成的结果:{model_result}\n\n参考答案:{reference}"
+                }
+            ]
+        else:
+            prompt = base_prompt + [
+                {
+                    "role": "user",
+                    "content": f"问题:{question}\n\n生成的结果:{model_result}\n\n参考答案:{reference}"
+                }
+            ]
         return prompt
 
+    # AIzaSyAW_h8itGLwNhYTfx1EDLthhcHHlcIfs7w (google)
     def extract_score(self, response_text):
         # 提取评分
         pattern = [
             r"^评分为([1-5])分",
+            r"评分:([1-5])分",
         ]
         score_list = []
         for p in pattern:
@@ -59,6 +104,8 @@ class GPTScorer:
                 score_list = re.findall(p, response_text)
             else:
                 break
+        if len(score_list) == 0:
+            return '3'
         return score_list[0]
 
 
diff --git a/scoring/ppl_scorer.py b/scoring/ppl_scorer.py
new file mode 100644
index 0000000..7111e50
--- /dev/null
+++ b/scoring/ppl_scorer.py
@@ -0,0 +1,10 @@
+import evaluate
+
+perplexity = evaluate.load("../metrics/perplexity")
+input_texts = ["你好!", "打死哦对吉萨大你去我家而且我就", "这辆车非常优秀"]
+results = perplexity.compute(model_id='gpt2',
+                             add_start_token=False,
+                             predictions=input_texts)
+print(list(results.keys()))
+print(results["perplexities"])
+
diff --git a/test.py b/test.py
index 02b2137..6b5c489 100644
--- a/test.py
+++ b/test.py
@@ -1,7 +1,28 @@
 from scoring.assessment_engine import AssessmentEngine
 
-assessment_engine = AssessmentEngine("chatglm2_glm2_pt1_2024-03-08_11-24-47",
+assessment_engine = AssessmentEngine("other",
                                      "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")
-assessment_engine.eval_subject("car_knowledge", "car_knowledge_qa_test_result.csv")
-assessment_engine.eval_subject("car_use", "car_use_qa_test_result.csv")
-assessment_engine.eval_subject("car_market", "car_market_qa_test_result.csv")
+assessment_engine.eval_result_diff("0408output-dora.csv")
+
+# synthesis_rouge_score = 0.30358589506467687
+# print("微调模型ROUGE分数:", synthesis_rouge_score)
+# original_rouge_score = 0.26004000118452175
+# print("原模型ROUGE分数:", original_rouge_score)
+# synthesis_acc_score = 2.768
+# print("微调模型准确性分数:", synthesis_acc_score)
+# original_acc_score = 2.724
+# print("原模型准确性分数:", original_acc_score)
+# synthesis_fluency_score = 2.098
+# print("微调模型流畅度分数:", synthesis_fluency_score)
+# original_fluency_score = 2.236
+# print("原模型流畅度分数:", original_fluency_score)
+# synthesis_diff_score = 2.278
+# print("微调模型优于原模型分数:", synthesis_diff_score)
+#
+# synthesis_score = (synthesis_rouge_score * 100 + synthesis_acc_score * 100/4 + synthesis_fluency_score * 100/3
+#                    + synthesis_diff_score * 100/3) / 4
+# original_synthesis_score = (original_rouge_score * 100 + original_acc_score * 100/4 +
+#                             original_fluency_score * 100/3 + 66) / 4
+#
+# print("综合评分:", synthesis_score)
+# print("原模型综合评分:", original_synthesis_score)