|
|
from scoring.gpt_scorer import GPTScorer
|
|
|
from scoring.rogue_scorer import get_rouge_score
|
|
|
import pandas as pd
|
|
|
import time
|
|
|
from tqdm import tqdm
|
|
|
|
|
|
|
|
|
class AssessmentEngine:
    """Score model answers stored in CSV logs with ROUGE and a GPT-based scorer.

    Every file is read from and written to ``logs/<save_result_dir>/``.
    """

    # Columns of ``eval_result_diff`` that receive ROUGE-1 F values.
    _ROUGE_COLUMNS = ('rouge_score_finetune', 'rouge_score_origin')

    # Columns of ``eval_result_diff`` that receive whatever the GPT scorer
    # returns (scores and free-text responses), in the original column order.
    _SCORER_COLUMNS = (
        'acc_finetune',
        'acc_origin',
        'fluency_finetune',
        'fluency_origin',
        'diff_score',
        'acc_response_finetune',
        'acc_response_origin',
        'fluency_response_finetune',
        'fluency_response_origin',
        'diff_score_response',
    )

    def __init__(self, save_result_dir, api_key):
        """Remember the log sub-directory and build the GPT scorer.

        Args:
            save_result_dir: Name of the directory under ``logs/`` that holds
                the input CSV files and receives the result files.
            api_key: Key handed to ``GPTScorer``.
        """
        self.save_result_dir = save_result_dir
        self.gpt_scorer = GPTScorer(api_key)

    def _log_path(self, file_name):
        """Return the path of *file_name* inside this engine's log directory."""
        return 'logs/' + self.save_result_dir + '/' + file_name

    @staticmethod
    def _is_blank(value):
        """Return True for None, NaN, or an empty/whitespace-only string."""
        if value is None:
            return True
        if isinstance(value, str):
            return not value.strip()
        return bool(pd.isna(value))

    @staticmethod
    def _parse_score(raw_score):
        """Convert a scorer result to ``float``; return None when it is unusable.

        Digit-only strings and plain numbers are accepted. None, NaN, booleans
        and any other text yield None.
        """
        if raw_score is None or isinstance(raw_score, bool):
            return None
        if isinstance(raw_score, (int, float)):
            return None if pd.isna(raw_score) else float(raw_score)
        text = str(raw_score)
        if not text.isdigit():
            return None
        try:
            return float(text)
        except ValueError:
            # str.isdigit() accepts characters such as superscripts that
            # float() cannot parse.
            return None

    @staticmethod
    def _mean(total, count):
        """Return ``total / count``, or 0.0 when nothing was counted."""
        return total / count if count else 0.0

    @staticmethod
    def _composite_score(rouge, accuracy, fluency, diff_component):
        """Average the four components after rescaling them.

        ROUGE is multiplied by 100, accuracy by 100 / 4 and fluency by
        100 / 3; *diff_component* is used as given.
        """
        return (rouge * 100 + accuracy * 100 / 4 +
                fluency * 100 / 3 + diff_component) / 4

    def _score_into(self, frame, row_index, score_column, response_column, *scorer_args):
        """Ask the GPT scorer, record its answer in *frame*, return the numeric score.

        The scorer's raw score and response are written to *score_column* and
        *response_column* of row *row_index*. The return value is the parsed
        score, or 0.0 when the score cannot be interpreted as a number.
        """
        gpt_response, gpt_score = self.gpt_scorer.score_with_chatgpt(*scorer_args)
        frame.loc[row_index, score_column] = gpt_score
        frame.loc[row_index, response_column] = gpt_response
        parsed = self._parse_score(gpt_score)
        return 0.0 if parsed is None else parsed

    def eval_subject(self, subject_name, csv_file_name):
        """Score every row of one QA result file and save the scored copy.

        For each row the ROUGE-1 F value of ``model_output`` against
        ``answer`` is stored in ``rouge_score``, and the GPT scorer (in
        "accuracy" mode) fills ``gpt_score_acc`` / ``gpt_response_acc``.
        The result is written to
        ``<subject_name>_qa_test_score_<mean ROUGE-1 F>.csv``.

        Args:
            subject_name: Prefix of the output file name.
            csv_file_name: Input CSV with ``question``, ``model_output`` and
                ``answer`` columns.
        """
        qa_result_df = pd.read_csv(self._log_path(csv_file_name))
        start_time = time.time()
        row_count = 0
        rouge_score_sum = 0
        for row_index, row in tqdm(qa_result_df.iterrows(), total=len(qa_result_df)):
            row_count += 1
            test_question = row['question']
            model_response = row['model_output']
            reference_answer = row['answer']

            rouge_score = get_rouge_score(model_response, reference_answer)
            rouge_1_f_score = rouge_score['rouge-1']['f']
            rouge_score_sum += rouge_1_f_score
            qa_result_df.loc[row_index, 'rouge_score'] = rouge_1_f_score

            self.gpt_scorer.mode("accuracy")
            gpt_response_acc, gpt_score_acc = self.gpt_scorer.score_with_chatgpt(
                test_question, model_response, reference_answer)
            qa_result_df.loc[row_index, 'gpt_score_acc'] = gpt_score_acc
            qa_result_df.loc[row_index, 'gpt_response_acc'] = gpt_response_acc

        elapsed_time = time.time() - start_time
        print("共评估结果" + str(row_count) + "条,总共用时:", elapsed_time, "秒")

        # An empty input file yields a score of 0.0 instead of a ZeroDivisionError.
        synthesis_score = self._mean(rouge_score_sum, row_count)
        qa_result_df.to_csv(
            self._log_path(subject_name + '_qa_test_score_' + str(synthesis_score) + '.csv'),
            index=False)

    def eval_result_diff(self, csv_file_name):
        """Compare fine-tuned and original model answers row by row.

        Each row with a non-blank ``question`` gets ROUGE-1 F values and GPT
        scores ("accuracy", "fluency" and "diff" modes) for the
        ``predict_finetune`` and ``predict_origin`` answers against ``answer``.
        Averages and composite scores are printed, and the scored table is
        written to ``<timestamp>_result_diff_test_score_<composite>.csv``.
        Rows that cannot be scored numerically still count in the averages,
        contributing 0.

        Args:
            csv_file_name: Input CSV with ``question``, ``predict_finetune``,
                ``predict_origin`` and ``answer`` columns.
        """
        result_diff_df = pd.read_csv(self._log_path(csv_file_name))

        # Create the result columns with dtypes that can hold what is stored
        # in them later, so no assignment depends on implicit upcasting.
        for column in self._ROUGE_COLUMNS:
            result_diff_df[column] = 0.0
        for column in self._SCORER_COLUMNS:
            result_diff_df[column] = pd.Series(0, index=result_diff_df.index, dtype=object)

        start_time = time.time()
        finetune_rouge_score_sum = 0
        origin_rouge_score_sum = 0
        finetune_acc_score_sum = 0
        origin_acc_score_sum = 0
        finetune_fluency_score_sum = 0
        origin_fluency_score_sum = 0
        model_better_score_sum = 0
        row_count = 0

        for row_index, row in tqdm(result_diff_df.iterrows(), total=len(result_diff_df)):
            test_question = row['question']
            # read_csv turns empty cells into NaN, so comparing with '' alone
            # would never skip anything.
            if self._is_blank(test_question):
                continue
            row_count += 1
            finetune_model_response = row['predict_finetune']
            original_model_response = row['predict_origin']
            reference_answer = row['answer']

            # Compute ROUGE scores
            finetune_rouge_score = get_rouge_score(finetune_model_response, reference_answer)
            finetune_rouge_1_f_score = finetune_rouge_score['rouge-1']['f']
            finetune_rouge_score_sum += finetune_rouge_1_f_score
            result_diff_df.loc[row_index, 'rouge_score_finetune'] = finetune_rouge_1_f_score

            origin_rouge_score = get_rouge_score(original_model_response, reference_answer)
            origin_rouge_1_f_score = origin_rouge_score['rouge-1']['f']
            origin_rouge_score_sum += origin_rouge_1_f_score
            result_diff_df.loc[row_index, 'rouge_score_origin'] = origin_rouge_1_f_score

            self.gpt_scorer.mode("accuracy")
            finetune_acc_score_sum += self._score_into(
                result_diff_df, row_index, 'acc_finetune', 'acc_response_finetune',
                test_question, finetune_model_response, reference_answer)
            origin_acc_score_sum += self._score_into(
                result_diff_df, row_index, 'acc_origin', 'acc_response_origin',
                test_question, original_model_response, reference_answer)

            self.gpt_scorer.mode("fluency")
            finetune_fluency_score_sum += self._score_into(
                result_diff_df, row_index, 'fluency_finetune', 'fluency_response_finetune',
                test_question, finetune_model_response, reference_answer)
            origin_fluency_score_sum += self._score_into(
                result_diff_df, row_index, 'fluency_origin', 'fluency_response_origin',
                test_question, original_model_response, reference_answer)

            self.gpt_scorer.mode("diff")
            model_better_score_sum += self._score_into(
                result_diff_df, row_index, 'diff_score', 'diff_score_response',
                test_question, finetune_model_response, reference_answer,
                original_model_response)

            # Save progress after every row so finished rows survive a crash.
            result_diff_df.to_csv(self._log_path('result_diff_test_score_tmp.csv'), index=False)

        elapsed_time = time.time() - start_time
        print("共评估结果" + str(row_count) + "条,总共用时:", elapsed_time, "秒")

        # _mean() returns 0.0 when no row was evaluated, avoiding a
        # ZeroDivisionError on empty or fully skipped input.
        synthesis_rouge_score = self._mean(finetune_rouge_score_sum, row_count)
        original_rouge_score = self._mean(origin_rouge_score_sum, row_count)
        synthesis_acc_score = self._mean(finetune_acc_score_sum, row_count)
        original_acc_score = self._mean(origin_acc_score_sum, row_count)
        synthesis_fluency_score = self._mean(finetune_fluency_score_sum, row_count)
        original_fluency_score = self._mean(origin_fluency_score_sum, row_count)
        synthesis_diff_score = self._mean(model_better_score_sum, row_count)

        print("微调模型ROUGE分数:", synthesis_rouge_score)
        print("原模型ROUGE分数:", original_rouge_score)
        print("微调模型准确性分数:", synthesis_acc_score)
        print("原模型准确性分数:", original_acc_score)
        print("微调模型流畅度分数:", synthesis_fluency_score)
        print("原模型流畅度分数:", original_fluency_score)
        print("微调模型优于原模型分数:", synthesis_diff_score)

        synthesis_score = self._composite_score(
            synthesis_rouge_score, synthesis_acc_score, synthesis_fluency_score,
            synthesis_diff_score * 100 / 3)
        print("综合评分:", synthesis_score)

        # NOTE(review): 66 replaces the diff component for the original model;
        # presumably a fixed baseline (about 2 * 100 / 3) - confirm.
        original_synthesis_score = self._composite_score(
            original_rouge_score, original_acc_score, original_fluency_score, 66)
        print("原模型综合评分:", original_synthesis_score)

        # Get the current local time as a string
        current_time = time.strftime("%Y%m%d%H%M%S", time.localtime(time.time()))
        result_diff_df.to_csv(
            self._log_path(current_time + '_result_diff_test_score_'
                           + str(synthesis_score) + '.csv'),
            index=False)
|