You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

150 lines
9.1 KiB
Python

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

from scoring.gpt_scorer import GPTScorer
from scoring.rogue_scorer import get_rouge_score
import pandas as pd
import time
from tqdm import tqdm
class AssessmentEngine:
    """Score model answers stored in CSV files under ``logs/<save_result_dir>/``.

    Every evaluation reads a CSV of questions, model outputs and reference
    answers, attaches a ROUGE-1 F score (``get_rouge_score``) and
    ChatGPT-based scores (``GPTScorer``) to each row, and writes the enriched
    table back into the same directory.
    """

    # Columns added by ``eval_result_diff``, in the order they appear in the
    # output CSV.
    _DIFF_RESULT_COLUMNS = (
        'rouge_score_finetune',
        'rouge_score_origin',
        'acc_finetune',
        'acc_origin',
        'fluency_finetune',
        'fluency_origin',
        'diff_score',
        'acc_response_finetune',
        'acc_response_origin',
        'fluency_response_finetune',
        'fluency_response_origin',
        'diff_score_response',
    )
    # Only the ROUGE columns are guaranteed to hold floats; every other column
    # stores whatever the GPT scorer returned (text, or possibly None).
    _DIFF_FLOAT_COLUMNS = frozenset(('rouge_score_finetune', 'rouge_score_origin'))

    def __init__(self, save_result_dir, api_key):
        """Remember the log sub-directory and build the GPT scorer.

        Args:
            save_result_dir: Name of the directory below ``logs/`` that holds
                the input CSVs and receives the score files.
            api_key: Passed unchanged to ``GPTScorer``.
        """
        self.save_result_dir = save_result_dir
        self.gpt_scorer = GPTScorer(api_key)

    def _log_path(self, file_name):
        """Return the path of *file_name* inside this engine's log directory."""
        return 'logs/' + self.save_result_dir + '/' + file_name

    @staticmethod
    def _numeric_score(raw_score):
        """Return *raw_score* as a float, or 0.0 when it is not a usable number.

        Strings count only when they consist solely of decimal digits, so
        values such as ``"3.5"``, ``""`` or free text contribute nothing to a
        running total. ``str.isdecimal`` is used rather than ``str.isdigit``
        because the latter also accepts characters like ``'²'`` that
        ``float()`` rejects.
        """
        if isinstance(raw_score, str):
            return float(raw_score) if raw_score.isdecimal() else 0.0
        if isinstance(raw_score, bool):
            return 0.0
        if isinstance(raw_score, (int, float)) and not pd.isna(raw_score):
            return float(raw_score)
        return 0.0

    def _score_both_models(self, mode, question, finetune_response,
                           origin_response, reference_answer):
        """Score the fine-tuned and the original answer in one scorer *mode*.

        The mode is set once and then used for both calls.

        Returns:
            ``(finetune_result, origin_result)`` where each result is whatever
            ``score_with_chatgpt`` returned (unpacked by the caller as a
            ``(response, score)`` pair).
        """
        self.gpt_scorer.mode(mode)
        finetune_result = self.gpt_scorer.score_with_chatgpt(
            question, finetune_response, reference_answer)
        origin_result = self.gpt_scorer.score_with_chatgpt(
            question, origin_response, reference_answer)
        return finetune_result, origin_result

    def eval_subject(self, subject_name: str, csv_file_name: str) -> None:
        """Score one subject's QA results and save them with the mean ROUGE score.

        Reads ``logs/<save_result_dir>/<csv_file_name>``, which must provide
        the columns ``question``, ``model_output`` and ``answer``. For every
        row the ROUGE-1 F score and the GPT "accuracy" score/response are
        stored in the columns ``rouge_score``, ``gpt_score_acc`` and
        ``gpt_response_acc``. The result is written to
        ``<subject_name>_qa_test_score_<mean rouge>.csv`` in the same
        directory.

        An input without data rows is skipped (nothing is written) instead of
        failing with a division by zero.
        """
        qa_result_df = pd.read_csv(self._log_path(csv_file_name))
        if qa_result_df.empty:
            print("No rows to evaluate; skipping scoring.")
            return

        # Create the result columns up front with dtypes that can hold the
        # values written below; relying on implicit upcasting during ``.loc``
        # assignment is deprecated in recent pandas releases.
        qa_result_df['rouge_score'] = float('nan')
        qa_result_df['gpt_score_acc'] = pd.Series(index=qa_result_df.index, dtype=object)
        qa_result_df['gpt_response_acc'] = pd.Series(index=qa_result_df.index, dtype=object)

        start_time = time.time()
        rouge_score_sum = 0
        for row_index, row in tqdm(qa_result_df.iterrows(), total=len(qa_result_df)):
            test_question = row['question']
            model_response = row['model_output']
            reference_answer = row['answer']

            rouge_1_f_score = get_rouge_score(model_response, reference_answer)['rouge-1']['f']
            rouge_score_sum += rouge_1_f_score
            qa_result_df.loc[row_index, 'rouge_score'] = rouge_1_f_score

            self.gpt_scorer.mode("accuracy")
            gpt_response_acc, gpt_score_acc = self.gpt_scorer.score_with_chatgpt(
                test_question, model_response, reference_answer)
            qa_result_df.loc[row_index, 'gpt_score_acc'] = gpt_score_acc
            qa_result_df.loc[row_index, 'gpt_response_acc'] = gpt_response_acc

        row_count = len(qa_result_df)
        elapsed_time = time.time() - start_time
        print("共评估结果" + str(row_count) + "条,总共用时:", elapsed_time, "")

        # Only the mean ROUGE score goes into the file name.
        synthesis_score = rouge_score_sum / row_count
        qa_result_df.to_csv(
            self._log_path(subject_name + '_qa_test_score_' + str(synthesis_score) + '.csv'),
            index=False)

    def eval_result_diff(self, csv_file_name: str) -> None:
        """Compare a fine-tuned model against the original model row by row.

        Reads ``logs/<save_result_dir>/<csv_file_name>``, which must provide
        the columns ``question``, ``predict_finetune``, ``predict_origin`` and
        ``answer``. Rows whose question is missing or empty are left with the
        default value ``0`` in every result column and are excluded from the
        averages. Each remaining row receives:

        * ROUGE-1 F scores for both answers,
        * GPT "accuracy" and "fluency" scores/responses for both answers,
        * a GPT "diff" score/response for the fine-tuned answer, with the
          original answer passed as an extra argument.

        Averages and composite scores are printed, and the table is written
        to ``<timestamp>_result_diff_test_score_<composite>.csv``. When no row
        has a question, nothing is scored or written.
        """
        result_diff_df = pd.read_csv(self._log_path(csv_file_name))

        # Float columns for ROUGE, object columns for GPT output: the scorer's
        # values may be text, which an integer column cannot hold without a
        # dtype change (deprecated in recent pandas releases).
        for column in self._DIFF_RESULT_COLUMNS:
            if column in self._DIFF_FLOAT_COLUMNS:
                result_diff_df[column] = 0.0
            else:
                result_diff_df[column] = pd.Series(
                    0, index=result_diff_df.index, dtype=object)

        # ``pd.read_csv`` turns empty cells into NaN, so comparing against ''
        # alone never detects a blank question.
        questions = result_diff_df['question']
        pending_rows = result_diff_df[questions.notna() & (questions != '')]
        if pending_rows.empty:
            print("No rows with a question to evaluate; skipping scoring.")
            return

        start_time = time.time()
        finetune_rouge_score_sum = 0
        origin_rouge_score_sum = 0
        finetune_acc_score_sum = 0
        origin_acc_score_sum = 0
        finetune_fluency_score_sum = 0
        origin_fluency_score_sum = 0
        model_better_score_sum = 0

        for row_index, row in tqdm(pending_rows.iterrows(), total=len(pending_rows)):
            test_question = row['question']
            finetune_model_response = row['predict_finetune']
            original_model_response = row['predict_origin']
            reference_answer = row['answer']

            # ROUGE scores
            finetune_rouge = get_rouge_score(
                finetune_model_response, reference_answer)['rouge-1']['f']
            origin_rouge = get_rouge_score(
                original_model_response, reference_answer)['rouge-1']['f']

            # GPT accuracy and fluency, each for both models
            (acc_response_ft, acc_score_ft), (acc_response_or, acc_score_or) = (
                self._score_both_models("accuracy", test_question,
                                        finetune_model_response,
                                        original_model_response,
                                        reference_answer))
            (flu_response_ft, flu_score_ft), (flu_response_or, flu_score_or) = (
                self._score_both_models("fluency", test_question,
                                        finetune_model_response,
                                        original_model_response,
                                        reference_answer))

            # Head-to-head comparison of the two answers
            self.gpt_scorer.mode("diff")
            diff_response, diff_score = self.gpt_scorer.score_with_chatgpt(
                test_question, finetune_model_response, reference_answer,
                original_model_response)

            row_values = {
                'rouge_score_finetune': finetune_rouge,
                'rouge_score_origin': origin_rouge,
                'acc_finetune': acc_score_ft,
                'acc_response_finetune': acc_response_ft,
                'acc_origin': acc_score_or,
                'acc_response_origin': acc_response_or,
                'fluency_finetune': flu_score_ft,
                'fluency_response_finetune': flu_response_ft,
                'fluency_origin': flu_score_or,
                'fluency_response_origin': flu_response_or,
                'diff_score': diff_score,
                'diff_score_response': diff_response,
            }
            for column, value in row_values.items():
                result_diff_df.loc[row_index, column] = value

            finetune_rouge_score_sum += finetune_rouge
            origin_rouge_score_sum += origin_rouge
            finetune_acc_score_sum += self._numeric_score(acc_score_ft)
            origin_acc_score_sum += self._numeric_score(acc_score_or)
            finetune_fluency_score_sum += self._numeric_score(flu_score_ft)
            origin_fluency_score_sum += self._numeric_score(flu_score_or)
            model_better_score_sum += self._numeric_score(diff_score)

            # Checkpoint after every row so progress survives a failure in a
            # later scorer call.
            # NOTE(review): the per-row placement is inferred; confirm it
            # matches the intended behaviour.
            result_diff_df.to_csv(
                self._log_path('result_diff_test_score_tmp.csv'), index=False)

        row_count = len(pending_rows)
        elapsed_time = time.time() - start_time
        print("共评估结果" + str(row_count) + "条,总共用时:", elapsed_time, "")

        synthesis_rouge_score = finetune_rouge_score_sum / row_count
        original_rouge_score = origin_rouge_score_sum / row_count
        synthesis_acc_score = finetune_acc_score_sum / row_count
        original_acc_score = origin_acc_score_sum / row_count
        synthesis_fluency_score = finetune_fluency_score_sum / row_count
        original_fluency_score = origin_fluency_score_sum / row_count
        synthesis_diff_score = model_better_score_sum / row_count
        print("微调模型ROUGE分数", synthesis_rouge_score)
        print("原模型ROUGE分数", original_rouge_score)
        print("微调模型准确性分数:", synthesis_acc_score)
        print("原模型准确性分数:", original_acc_score)
        print("微调模型流畅度分数:", synthesis_fluency_score)
        print("原模型流畅度分数:", original_fluency_score)
        print("微调模型优于原模型分数:", synthesis_diff_score)

        # Each component is scaled to roughly 0-100 and the four are averaged.
        # NOTE(review): the divisors suggest accuracy is graded out of 4 and
        # fluency/diff out of 3 - confirm against the GPTScorer prompts.
        synthesis_score = (synthesis_rouge_score * 100 + synthesis_acc_score * 100 / 4 +
                           synthesis_fluency_score * 100 / 3 + synthesis_diff_score * 100 / 3) / 4
        print("综合评分:", synthesis_score)
        # The original model has no diff score of its own; a fixed 66 is used
        # in its place (presumably a neutral 2-out-of-3 - TODO confirm).
        original_synthesis_score = (original_rouge_score * 100 + original_acc_score * 100 / 4 +
                                    original_fluency_score * 100 / 3 + 66) / 4
        print("原模型综合评分:", original_synthesis_score)

        # Timestamp for the output file name
        current_time = time.strftime("%Y%m%d%H%M%S", time.localtime(time.time()))
        result_diff_df.to_csv(
            self._log_path(current_time + '_result_diff_test_score_'
                           + str(synthesis_score) + '.csv'),
            index=False)