import time

import pandas as pd
from tqdm import tqdm

from scoring.gpt_scorer import GPTScorer
from scoring.rogue_scorer import get_rouge_score


class AssessmentEngine:
    """Scores model QA outputs with ROUGE-1 F1 plus GPT-judged accuracy,
    fluency, and (for model comparisons) a head-to-head diff score."""

    def __init__(self, save_result_dir, api_key):
        self.save_result_dir = save_result_dir
        self.gpt_scorer = GPTScorer(api_key)

    def eval_subject(self, subject_name, csv_file_name):
        qa_result_df = pd.read_csv(self.save_result_dir + '/' + csv_file_name)
        start_time = time.time()
        row_count = 0
        rouge_score_sum = 0
        for row_index, row in tqdm(qa_result_df.iterrows(), total=len(qa_result_df)):
            row_count += 1
            test_question = row['question']
            model_response = row['model_output']
            reference_answer = row['answer']

            # ROUGE-1 F1 against the reference answer
            rouge_score = get_rouge_score(model_response, reference_answer)
            rouge_1_f_score = rouge_score['rouge-1']['f']
            rouge_score_sum += rouge_1_f_score
            qa_result_df.loc[row_index, 'rouge_score'] = rouge_1_f_score

            # GPT-judged accuracy
            self.gpt_scorer.mode("accuracy")
            gpt_response_acc, gpt_score_acc = self.gpt_scorer.score_with_chatgpt(
                test_question, model_response, reference_answer)
            qa_result_df.loc[row_index, 'gpt_score_acc'] = gpt_score_acc
            qa_result_df.loc[row_index, 'gpt_response_acc'] = gpt_response_acc

        elapsed_time = time.time() - start_time
        print("Evaluated " + str(row_count) + " rows; total time:", elapsed_time, "seconds")
        if row_count == 0:
            print("No rows evaluated; skipping score summary.")
            return
        synthesis_score = rouge_score_sum / row_count
        qa_result_df.to_csv(self.save_result_dir + '/' + subject_name + '_qa_test_score_'
                            + str(synthesis_score) + '.csv', index=False)

    def eval_result_diff(self, csv_file_name, file_type='csv'):
        if file_type == 'json':
            result_diff_df = pd.read_json(self.save_result_dir + '/' + csv_file_name)
        elif file_type == 'csv':
            result_diff_df = pd.read_csv(self.save_result_dir + '/' + csv_file_name)
        else:
            print("Unknown file type: " + file_type)
            return

        # Pre-create the score columns with placeholder values
        for col in ['rouge_score_finetune', 'rouge_score_origin',
                    'acc_finetune', 'acc_origin',
                    'fluency_finetune', 'fluency_origin', 'diff_score',
                    'acc_response_finetune', 'acc_response_origin',
                    'fluency_response_finetune', 'fluency_response_origin',
                    'diff_score_response']:
            result_diff_df[col] = '0'

        start_time = time.time()
        finetune_rouge_score_sum = 0
        origin_rouge_score_sum = 0
        finetune_acc_score_sum = 0
        origin_acc_score_sum = 0
        finetune_fluency_score_sum = 0
        origin_fluency_score_sum = 0
        model_better_score_sum = 0
        row_count = 0
        for row_index, row in tqdm(result_diff_df.iterrows(), total=len(result_diff_df)):
            if row['question'] == '':
                continue
            row_count += 1
            test_question = row['question']
            finetune_model_response = row['predict_finetune']
            original_model_response = row['predict_origin']
            reference_answer = row['answer']

            # Compute ROUGE scores for both models
            finetune_rouge_score = get_rouge_score(finetune_model_response, reference_answer)
            finetune_rouge_1_f_score = finetune_rouge_score['rouge-1']['f']
            finetune_rouge_score_sum += finetune_rouge_1_f_score
            result_diff_df.loc[row_index, 'rouge_score_finetune'] = finetune_rouge_1_f_score

            origin_rouge_score = get_rouge_score(original_model_response, reference_answer)
            origin_rouge_1_f_score = origin_rouge_score['rouge-1']['f']
            origin_rouge_score_sum += origin_rouge_1_f_score
            result_diff_df.loc[row_index, 'rouge_score_origin'] = origin_rouge_1_f_score

            # GPT-judged accuracy for both models
            self.gpt_scorer.mode("accuracy")
            gpt_response_acc, gpt_score_acc = self.gpt_scorer.score_with_chatgpt(
                test_question, finetune_model_response, reference_answer)
            result_diff_df.loc[row_index, 'acc_finetune'] = gpt_score_acc
            result_diff_df.loc[row_index, 'acc_response_finetune'] = gpt_response_acc
            if (gpt_score_acc is not None) and gpt_score_acc.isdigit():
                finetune_acc_score_sum += float(gpt_score_acc)

            gpt_response_acc, gpt_score_acc = self.gpt_scorer.score_with_chatgpt(
                test_question, original_model_response, reference_answer)
            result_diff_df.loc[row_index, 'acc_origin'] = gpt_score_acc
            result_diff_df.loc[row_index, 'acc_response_origin'] = gpt_response_acc
            if (gpt_score_acc is not None) and gpt_score_acc.isdigit():
                origin_acc_score_sum += float(gpt_score_acc)

            # GPT-judged fluency for both models
            self.gpt_scorer.mode("fluency")
            gpt_response_fluency, gpt_score_fluency = self.gpt_scorer.score_with_chatgpt(
                test_question, finetune_model_response, reference_answer)
            result_diff_df.loc[row_index, 'fluency_finetune'] = gpt_score_fluency
            result_diff_df.loc[row_index, 'fluency_response_finetune'] = gpt_response_fluency
            if (gpt_score_fluency is not None) and gpt_score_fluency.isdigit():
                finetune_fluency_score_sum += float(gpt_score_fluency)

            gpt_response_fluency, gpt_score_fluency = self.gpt_scorer.score_with_chatgpt(
                test_question, original_model_response, reference_answer)
            result_diff_df.loc[row_index, 'fluency_origin'] = gpt_score_fluency
            result_diff_df.loc[row_index, 'fluency_response_origin'] = gpt_response_fluency
            if (gpt_score_fluency is not None) and gpt_score_fluency.isdigit():
                origin_fluency_score_sum += float(gpt_score_fluency)

            # GPT head-to-head comparison of the two responses
            self.gpt_scorer.mode("diff")
            gpt_response_diff, gpt_score_diff = self.gpt_scorer.score_with_chatgpt(
                test_question, finetune_model_response, reference_answer, original_model_response)
            result_diff_df.loc[row_index, 'diff_score'] = gpt_score_diff
            result_diff_df.loc[row_index, 'diff_score_response'] = gpt_response_diff
            if (gpt_score_diff is not None) and gpt_score_diff.isdigit():
                model_better_score_sum += float(gpt_score_diff)

            # Checkpoint partial results after each row
            result_diff_df.to_csv(self.save_result_dir + '/result_diff_test_score_tmp.csv',
                                  index=False)

        elapsed_time = time.time() - start_time
        print("Evaluated " + str(row_count) + " rows; total time:", elapsed_time, "seconds")
        if row_count == 0:
            print("No rows evaluated; skipping score summary.")
            return
        synthesis_rouge_score = finetune_rouge_score_sum / row_count
        original_rouge_score = origin_rouge_score_sum / row_count
        synthesis_acc_score = finetune_acc_score_sum / row_count
        original_acc_score = origin_acc_score_sum / row_count
        synthesis_fluency_score = finetune_fluency_score_sum / row_count
        original_fluency_score = origin_fluency_score_sum / row_count
        synthesis_diff_score = model_better_score_sum / row_count
        print("Fine-tuned model ROUGE score:", synthesis_rouge_score)
        print("Original model ROUGE score:", original_rouge_score)
        print("Fine-tuned model accuracy score:", synthesis_acc_score)
        print("Original model accuracy score:", original_acc_score)
        print("Fine-tuned model fluency score:", synthesis_fluency_score)
        print("Original model fluency score:", original_fluency_score)
        print("Fine-tuned-better-than-original score:", synthesis_diff_score)

        # Composite score: ROUGE stays on [0, 1] scaled by 100; accuracy is
        # rescaled by 100/4, fluency and diff by 100/3, then the four terms
        # are averaged.
        synthesis_score = (synthesis_rouge_score * 100 + synthesis_acc_score * 100 / 4
                           + synthesis_fluency_score * 100 / 3
                           + synthesis_diff_score * 100 / 3) / 4
        print("Composite score:", synthesis_score)
        # The original model has no head-to-head term of its own; a fixed
        # constant 66 is substituted for it.
        original_synthesis_score = (original_rouge_score * 100 + original_acc_score * 100 / 4
                                    + original_fluency_score * 100 / 3 + 66) / 4
        print("Original model composite score:", original_synthesis_score)

        # Timestamp string for the output file name
        current_time = time.strftime("%Y%m%d%H%M%S", time.localtime(time.time()))
        result_diff_df.to_csv(self.save_result_dir + '/' + current_time + '_result_diff_test_score_'
                              + str(synthesis_score) + '.csv', index=False)

    def eval_result(self, file_name, file_type='csv'):
        start_time = time.time()
        if file_type == 'json':
            result_df = pd.read_json(self.save_result_dir + '/' + file_name)
        elif file_type == 'csv':
            result_df = pd.read_csv(self.save_result_dir + '/' + file_name)
        else:
            print("Unsupported file type: " + file_type)
            return

        # Pre-create the score columns with placeholder values
        for col in ['rouge_score_finetune', 'acc_finetune', 'fluency_finetune',
                    'acc_response_finetune', 'fluency_response_finetune']:
            result_df[col] = '0'

        rouge_score_sum = 0
        acc_score_sum = 0
        fluency_score_sum = 0
        row_count = 0
        for row_index, row in tqdm(result_df.iterrows(), total=len(result_df)):
            row_count += 1
            test_question = row['question']
            model_response = row['Predict']
            reference_answer = row['answer']

            # ROUGE-1 F1 against the reference answer
            rouge_score = get_rouge_score(model_response, reference_answer)
            rouge_1_f_score = rouge_score['rouge-1']['f']
            rouge_score_sum += rouge_1_f_score
            result_df.loc[row_index, 'rouge_score_finetune'] = rouge_1_f_score

            # GPT-judged accuracy
            self.gpt_scorer.mode("accuracy")
            gpt_response_acc, gpt_score_acc = self.gpt_scorer.score_with_chatgpt(
                test_question, model_response, reference_answer)
            result_df.loc[row_index, 'acc_finetune'] = gpt_score_acc
            result_df.loc[row_index, 'acc_response_finetune'] = gpt_response_acc
            if (gpt_score_acc is not None) and gpt_score_acc.isdigit():
                acc_score_sum += float(gpt_score_acc)

            # GPT-judged fluency
            self.gpt_scorer.mode("fluency")
            gpt_response_fluency, gpt_score_fluency = self.gpt_scorer.score_with_chatgpt(
                test_question, model_response, reference_answer)
            result_df.loc[row_index, 'fluency_finetune'] = gpt_score_fluency
            result_df.loc[row_index, 'fluency_response_finetune'] = gpt_response_fluency
            if (gpt_score_fluency is not None) and gpt_score_fluency.isdigit():
                fluency_score_sum += float(gpt_score_fluency)

            # Checkpoint partial results after each row
            result_df.to_csv(self.save_result_dir + '/result_test_score_tmp.csv', index=False)

        elapsed_time = time.time() - start_time
        print("Evaluated " + str(row_count) + " rows; total time:", elapsed_time, "seconds")
        if row_count == 0:
            print("No rows evaluated; skipping score summary.")
            return
        rouge_score = rouge_score_sum / row_count
        acc_score = acc_score_sum / row_count
        fluency_score = fluency_score_sum / row_count
        print("ROUGE score:", rouge_score)
        print("Accuracy score:", acc_score)
        print("Fluency score:", fluency_score)
        # Same composite formula as eval_result_diff; there is no diff term
        # for a single model, so the fixed constant 66 is substituted.
        synthesis_score = (rouge_score * 100 + acc_score * 100 / 4
                           + fluency_score * 100 / 3 + 66) / 4
        print("Composite score:", synthesis_score)
        result_df.to_csv(self.save_result_dir + f'/result_test_score_{synthesis_score}.csv',
                         index=False)
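

# A minimal usage sketch, not part of the original module: it assumes a
# results directory containing CSVs with the column names the methods above
# read ('question', 'answer', 'model_output' / 'predict_finetune' /
# 'predict_origin'), and that GPTScorer takes an OpenAI API key string, as
# the constructor suggests. The paths, file names, and key below are
# hypothetical placeholders.
if __name__ == '__main__':
    engine = AssessmentEngine(save_result_dir='results', api_key='YOUR_OPENAI_API_KEY')

    # Score one subject's QA results with ROUGE-1 F1 and a GPT accuracy judge
    engine.eval_subject('history', 'history_qa_test_result.csv')

    # Compare fine-tuned vs. original model outputs on ROUGE, accuracy,
    # fluency, and a head-to-head diff score
    engine.eval_result_diff('result_diff.csv', file_type='csv')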