from scoring.gpt_scorer import GPTScorer
from scoring.gemini_scorer import GeminiScorer
from scoring.dash_scope_scorer import DashScopeScorer
from scoring.rogue_scorer import get_rouge_score
import pandas as pd
import time
from tqdm import tqdm


class AssessmentEngine:
    def __init__(self, save_result_dir, api_key, eval_engine='gpt'):
        self.save_result_dir = save_result_dir
        self.eval_engine = eval_engine
        if eval_engine == 'gpt':
            self.llm_scorer = GPTScorer(api_key)
        elif eval_engine == 'gemini':
            self.llm_scorer = GeminiScorer(api_key)
        elif eval_engine == 'dashscope':
            self.llm_scorer = DashScopeScorer(api_key)
        else:
            raise ValueError("Unsupported evaluation engine: " + eval_engine)

    def eval_subject(self, subject_name, csv_file_name):
        qa_result_df = pd.read_csv(self.save_result_dir + '/' + csv_file_name)
        start_time = time.time()
        row_count = 0
        rouge_score_sum = 0
        for row_index, row in tqdm(qa_result_df.iterrows(), total=len(qa_result_df)):
            row_count += 1
            test_question = row['question']
            model_response = row['model_output']
            reference_answer = row['answer']
            # ROUGE-1 F1 between the model output and the reference answer
            rouge_score = get_rouge_score(model_response, reference_answer)
            rouge_1_f_score = rouge_score['rouge-1']['f']
            rouge_score_sum += rouge_1_f_score
            qa_result_df.loc[row_index, 'rouge_score'] = rouge_1_f_score
            # LLM-based accuracy score
            self.llm_scorer.mode("accuracy")
            gpt_response_acc, gpt_score_acc = self.llm_scorer.score_with_llm(
                test_question, model_response, reference_answer)
            qa_result_df.loc[row_index, 'gpt_score_acc'] = gpt_score_acc
            qa_result_df.loc[row_index, 'gpt_response_acc'] = gpt_response_acc
        end_time = time.time()
        elapsed_time = end_time - start_time
        print("Evaluated " + str(row_count) + " rows, total time:", elapsed_time, "seconds")
        synthesis_score = rouge_score_sum / row_count
        qa_result_df.to_csv(self.save_result_dir + '/' + subject_name + '_qa_test_score_'
                            + str(synthesis_score) + '.csv', index=False)

    def eval_result_diff(self, csv_file_name, file_type='csv'):
        if file_type == 'json':
            result_diff_df = pd.read_json(self.save_result_dir + '/' + csv_file_name)
        elif file_type == 'csv':
            result_diff_df = pd.read_csv(self.save_result_dir + '/' + csv_file_name)
        else:
            print("Unknown file type: " + file_type)
            return
        result_diff_df['rouge_score_finetune'] = '0'
        result_diff_df['rouge_score_origin'] = '0'
        result_diff_df['acc_finetune'] = '0'
        result_diff_df['acc_origin'] = '0'
        result_diff_df['fluency_finetune'] = '0'
        result_diff_df['fluency_origin'] = '0'
        result_diff_df['diff_score'] = '0'
        result_diff_df['acc_response_finetune'] = '0'
        result_diff_df['acc_response_origin'] = '0'
        result_diff_df['fluency_response_finetune'] = '0'
        result_diff_df['fluency_response_origin'] = '0'
        result_diff_df['diff_score_response'] = '0'
        start_time = time.time()
        finetune_rouge_score_sum = 0
        origin_rouge_score_sum = 0
        finetune_acc_score_sum = 0
        origin_acc_score_sum = 0
        finetune_fluency_score_sum = 0
        origin_fluency_score_sum = 0
        model_better_score_sum = 0
        row_count = 0
        for row_index, row in tqdm(result_diff_df.iterrows(), total=len(result_diff_df)):
            if row['question'] == '':
                continue
            row_count += 1
            test_question = row['question']
            finetune_model_response = row['predict_finetune']
            original_model_response = row['predict_origin']
            reference_answer = row['answer']
            # Compute ROUGE scores for the fine-tuned and the original model
            finetune_rouge_score = get_rouge_score(finetune_model_response, reference_answer)
            finetune_rouge_1_f_score = finetune_rouge_score['rouge-1']['f']
            finetune_rouge_score_sum += finetune_rouge_1_f_score
            result_diff_df.loc[row_index, 'rouge_score_finetune'] = finetune_rouge_1_f_score
            origin_rouge_score = get_rouge_score(original_model_response, reference_answer)
            origin_rouge_1_f_score = origin_rouge_score['rouge-1']['f']
            origin_rouge_score_sum += origin_rouge_1_f_score
            result_diff_df.loc[row_index, 'rouge_score_origin'] = origin_rouge_1_f_score
            # LLM-based accuracy scores for both models
            self.llm_scorer.mode("accuracy")
            gpt_response_acc, gpt_score_acc = self.llm_scorer.score_with_llm(
                test_question, finetune_model_response, reference_answer)
            result_diff_df.loc[row_index, 'acc_finetune'] = gpt_score_acc
            result_diff_df.loc[row_index, 'acc_response_finetune'] = gpt_response_acc
            if (gpt_score_acc is not None) and gpt_score_acc.isdigit():
                finetune_acc_score_sum += float(gpt_score_acc)
            gpt_response_acc, gpt_score_acc = self.llm_scorer.score_with_llm(
                test_question, original_model_response, reference_answer)
            result_diff_df.loc[row_index, 'acc_origin'] = gpt_score_acc
            result_diff_df.loc[row_index, 'acc_response_origin'] = gpt_response_acc
            if (gpt_score_acc is not None) and gpt_score_acc.isdigit():
                origin_acc_score_sum += float(gpt_score_acc)
            # LLM-based fluency scores for both models
            self.llm_scorer.mode("fluency")
            gpt_response_fluency, gpt_score_fluency = self.llm_scorer.score_with_llm(
                test_question, finetune_model_response, reference_answer)
            result_diff_df.loc[row_index, 'fluency_finetune'] = gpt_score_fluency
            result_diff_df.loc[row_index, 'fluency_response_finetune'] = gpt_response_fluency
            if (gpt_score_fluency is not None) and gpt_score_fluency.isdigit():
                finetune_fluency_score_sum += float(gpt_score_fluency)
            gpt_response_fluency, gpt_score_fluency = self.llm_scorer.score_with_llm(
                test_question, original_model_response, reference_answer)
            result_diff_df.loc[row_index, 'fluency_origin'] = gpt_score_fluency
            result_diff_df.loc[row_index, 'fluency_response_origin'] = gpt_response_fluency
            if (gpt_score_fluency is not None) and gpt_score_fluency.isdigit():
                origin_fluency_score_sum += float(gpt_score_fluency)
            # Pairwise comparison: does the fine-tuned answer beat the original one?
            self.llm_scorer.mode("diff")
            gpt_response_diff, gpt_score_diff = self.llm_scorer.score_with_llm(
                test_question, finetune_model_response, reference_answer, original_model_response)
            result_diff_df.loc[row_index, 'diff_score'] = gpt_score_diff
            result_diff_df.loc[row_index, 'diff_score_response'] = gpt_response_diff
            if (gpt_score_diff is not None) and gpt_score_diff.isdigit():
                model_better_score_sum += float(gpt_score_diff)
            # Checkpoint the partial results
            result_diff_df.to_csv(self.save_result_dir + '/result_diff_test_score_tmp.csv', index=False)
        end_time = time.time()
        elapsed_time = end_time - start_time
        print("Evaluated " + str(row_count) + " rows, total time:", elapsed_time, "seconds")
        synthesis_rouge_score = finetune_rouge_score_sum / row_count
        original_rouge_score = origin_rouge_score_sum / row_count
        synthesis_acc_score = finetune_acc_score_sum / row_count
        original_acc_score = origin_acc_score_sum / row_count
        finetune_fluency_score = finetune_fluency_score_sum / row_count
        original_fluency_score = origin_fluency_score_sum / row_count
        synthesis_diff_score = model_better_score_sum / row_count
        print("Fine-tuned model ROUGE score:", synthesis_rouge_score)
        print("Original model ROUGE score:", original_rouge_score)
        print("Fine-tuned model accuracy score:", synthesis_acc_score)
        print("Original model accuracy score:", original_acc_score)
        print("Fine-tuned model fluency score:", finetune_fluency_score)
        print("Original model fluency score:", original_fluency_score)
        print("Fine-tuned-better-than-original score:", synthesis_diff_score)
        # Composite: ROUGE (0-1), accuracy (/4), fluency (/3) and diff (/3) are each
        # rescaled to a 0-100 range and averaged
        synthesis_score = (synthesis_rouge_score * 100 + synthesis_acc_score * 100 / 4
                           + finetune_fluency_score * 100 / 3 + synthesis_diff_score * 100 / 3) / 4
        print("Composite score:", synthesis_score)
        # The original model has no pairwise diff score, so a fixed 66 stands in for that term
        original_synthesis_score = (original_rouge_score * 100 + original_acc_score * 100 / 4
                                    + original_fluency_score * 100 / 3 + 66) / 4
        print("Original model composite score:", original_synthesis_score)
        # Current time as a string, used in the output file name
        current_time = time.strftime("%Y%m%d%H%M%S", time.localtime(time.time()))
        result_diff_df.to_csv(self.save_result_dir + '/' + current_time + '_result_diff_test_score_'
                              + str(synthesis_score) + '.csv', index=False)
        # Append this run's summary to the evaluation history log
        history_result_df = pd.read_csv('logs/llm_result.csv', encoding='utf-8')
        score_data = {
            '评分模型': self.eval_engine,                    # scoring engine
            '评分文件': csv_file_name,                       # scored file
            '微调模型ROUGE分数': synthesis_rouge_score,       # fine-tuned model ROUGE score
            '原模型ROUGE分数': original_rouge_score,          # original model ROUGE score
            '微调模型准确性分数': synthesis_acc_score,         # fine-tuned model accuracy score
            '原模型准确性分数': original_acc_score,           # original model accuracy score
            '微调模型流畅度分数': finetune_fluency_score,      # fine-tuned model fluency score
            '原模型流畅度分数': original_fluency_score,        # original model fluency score
            '微调模型优于原模型分数': synthesis_diff_score,    # fine-tuned-better-than-original score
            '综合评分': synthesis_score,                      # composite score
            '原模型综合评分': original_synthesis_score         # original model composite score
        }
        history_result_df = pd.concat([history_result_df, pd.DataFrame(score_data, index=[0])],
                                      ignore_index=True)
        history_result_df.to_csv('logs/llm_result.csv', index=False, encoding='utf-8')

    def eval_result(self, file_name, file_type='csv'):
        start_time = time.time()
        if file_type == 'json':
            result_df = pd.read_json(self.save_result_dir + '/' + file_name)
        elif file_type == 'csv':
            result_df = pd.read_csv(self.save_result_dir + '/' + file_name)
        else:
            print("Unsupported file type: " + file_type)
            return
        result_df['rouge_score_finetune'] = '0'
        result_df['acc_finetune'] = '0'
        result_df['fluency_finetune'] = '0'
        result_df['acc_response_finetune'] = '0'
        result_df['fluency_response_finetune'] = '0'
        rouge_score_sum = 0
        acc_score_sum = 0
        fluency_score_sum = 0
        row_count = 0
        for row_index, row in tqdm(result_df.iterrows(), total=len(result_df)):
            row_count += 1
            test_question = row['question']
            model_response = row['Predict']
            reference_answer = row['answer']
            # ROUGE-1 F1 between the model output and the reference answer
            rouge_score = get_rouge_score(model_response, reference_answer)
            rouge_1_f_score = rouge_score['rouge-1']['f']
            rouge_score_sum += rouge_1_f_score
            result_df.loc[row_index, 'rouge_score_finetune'] = rouge_1_f_score
            # LLM-based accuracy score
            self.llm_scorer.mode("accuracy")
            gpt_response_acc, gpt_score_acc = self.llm_scorer.score_with_llm(
                test_question, model_response, reference_answer)
            result_df.loc[row_index, 'acc_finetune'] = gpt_score_acc
            result_df.loc[row_index, 'acc_response_finetune'] = gpt_response_acc
            if (gpt_score_acc is not None) and gpt_score_acc.isdigit():
                acc_score_sum += float(gpt_score_acc)
            # LLM-based fluency score
            self.llm_scorer.mode("fluency")
            gpt_response_fluency, gpt_score_fluency = self.llm_scorer.score_with_llm(
                test_question, model_response, reference_answer)
            result_df.loc[row_index, 'fluency_finetune'] = gpt_score_fluency
            result_df.loc[row_index, 'fluency_response_finetune'] = gpt_response_fluency
            if (gpt_score_fluency is not None) and gpt_score_fluency.isdigit():
                fluency_score_sum += float(gpt_score_fluency)
            # Checkpoint the partial results
            result_df.to_csv(self.save_result_dir + '/result_test_score_tmp.csv', index=False)
        end_time = time.time()
        elapsed_time = end_time - start_time
        print("Evaluated " + str(row_count) + " rows, total time:", elapsed_time, "seconds")
        rouge_score = rouge_score_sum / row_count
        acc_score = acc_score_sum / row_count
        fluency_score = fluency_score_sum / row_count
        print("ROUGE score:", rouge_score)
        print("Accuracy score:", acc_score)
        print("Fluency score:", fluency_score)
        # Same composite as eval_result_diff, with a fixed 66 in place of the diff term
        synthesis_score = (rouge_score * 100 + acc_score * 100 / 4 + fluency_score * 100 / 3 + 66) / 4
        print("Composite score:", synthesis_score)
        result_df.to_csv(self.save_result_dir + f'/result_test_score_{synthesis_score}.csv', index=False)
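

# Usage sketch (illustrative, not part of the original module): shows how the engine
# above is typically driven. The directory, file names, and API key below are
# placeholders/assumptions. eval_result expects a CSV with 'question', 'Predict', and
# 'answer' columns; eval_result_diff expects 'question', 'predict_finetune',
# 'predict_origin', and 'answer' columns, plus an existing 'logs/llm_result.csv'
# history file to append to.
if __name__ == '__main__':
    engine = AssessmentEngine(save_result_dir='results',   # hypothetical output directory
                              api_key='YOUR_API_KEY',      # placeholder key for the chosen engine
                              eval_engine='gpt')
    # Score a single model's predictions against the reference answers
    engine.eval_result('qa_test_result.csv', file_type='csv')  # hypothetical file name
    # Or compare a fine-tuned model against the original model
    # engine.eval_result_diff('finetune_vs_origin.csv', file_type='csv')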