# NOTE: The lines below are artifacts scraped from the repository's web page
# (Gitea UI hints and file-size badges), not part of the module. Preserved as
# a comment so the file remains valid Python:
#   "You cannot select more than 25 topics Topics must start with a letter or
#    number, can include dashes ('-') and can be up to 35 characters long."
#   "211 lines / 12 KiB / Python"
from scoring.gpt_scorer import GPTScorer
from scoring.rogue_scorer import get_rouge_score
import pandas as pd
import time
from tqdm import tqdm
class AssessmentEngine:
    """Evaluates model QA outputs with ROUGE-1 and ChatGPT-based scoring.

    Input result files are read from, and scored CSVs written back to,
    ``save_result_dir``.
    """

    def __init__(self, save_result_dir, api_key):
        """
        Args:
            save_result_dir: Directory containing input result files; scored
                output CSVs are also written here.
            api_key: API key forwarded to ``GPTScorer`` for the ChatGPT calls.
        """
        self.save_result_dir = save_result_dir
        self.gpt_scorer = GPTScorer(api_key)
def eval_subject(self, subject_name, csv_file_name):
qa_result_df = pd.read_csv(self.save_result_dir + '/' + csv_file_name)
start_time = time.time()
row_count = 0
rouge_score_sum = 0
for row_index, row in tqdm(qa_result_df.iterrows(), total=len(qa_result_df)):
row_count += 1
test_question = row['question']
model_response = row['model_output']
reference_answer = row['answer']
rouge_score = get_rouge_score(model_response, reference_answer)
rouge_1_f_score = rouge_score['rouge-1']['f']
rouge_score_sum += rouge_1_f_score
qa_result_df.loc[row_index, 'rouge_score'] = rouge_1_f_score
self.gpt_scorer.mode("accuracy")
gpt_response_acc, gpt_score_acc = self.gpt_scorer.score_with_chatgpt(test_question,
model_response, reference_answer)
qa_result_df.loc[row_index, 'gpt_score_acc'] = gpt_score_acc
qa_result_df.loc[row_index, 'gpt_response_acc'] = gpt_response_acc
end_time = time.time()
elapsed_time = end_time - start_time
print("共评估结果" + str(row_count) + "条,总共用时:", elapsed_time, "")
synthesis_score = rouge_score_sum / row_count
qa_result_df.to_csv(self.save_result_dir + '/' + subject_name + '_qa_test_score_'
+ str(synthesis_score) + '.csv', index=False)
def eval_result_diff(self, csv_file_name, file_type='csv'):
if file_type == 'json':
result_diff_df = pd.read_json(self.save_result_dir + '/' + csv_file_name)
elif file_type == 'csv':
result_diff_df = pd.read_csv(self.save_result_dir + '/' + csv_file_name)
else:
print("Unknown file type:" + file_type)
return
result_diff_df['rouge_score_finetune'] = 0
result_diff_df['rouge_score_origin'] = 0
result_diff_df['acc_finetune'] = 0
result_diff_df['acc_origin'] = 0
result_diff_df['fluency_finetune'] = 0
result_diff_df['fluency_origin'] = 0
result_diff_df['diff_score'] = 0
result_diff_df['acc_response_finetune'] = 0
result_diff_df['acc_response_origin'] = 0
result_diff_df['fluency_response_finetune'] = 0
result_diff_df['fluency_response_origin'] = 0
result_diff_df['diff_score_response'] = 0
start_time = time.time()
finetune_rouge_score_sum = 0
origin_rouge_score_sum = 0
finetune_acc_score_sum = 0
origin_acc_score_sum = 0
finetune_fluency_score_sum = 0
origin_fluency_score_sum = 0
model_better_score_sum = 0
row_count = 0
for row_index, row in tqdm(result_diff_df.iterrows(), total=len(result_diff_df)):
if row['question'] == '':
continue
row_count += 1
test_question = row['question']
finetune_model_response = row['predict_finetune']
original_model_response = row['predict_origin']
reference_answer = row['answer']
# 计算ROUGE分数
finetune_rouge_score = get_rouge_score(finetune_model_response, reference_answer)
finetune_rouge_1_f_score = finetune_rouge_score['rouge-1']['f']
finetune_rouge_score_sum += finetune_rouge_1_f_score
result_diff_df.loc[row_index, 'rouge_score_finetune'] = finetune_rouge_1_f_score
origin_rouge_score = get_rouge_score(original_model_response, reference_answer)
origin_rouge_1_f_score = origin_rouge_score['rouge-1']['f']
origin_rouge_score_sum += origin_rouge_1_f_score
result_diff_df.loc[row_index, 'rouge_score_origin'] = origin_rouge_1_f_score
self.gpt_scorer.mode("accuracy")
gpt_response_acc, gpt_score_acc = (self.gpt_scorer.score_with_chatgpt(test_question,
finetune_model_response,
reference_answer))
result_diff_df.loc[row_index, 'acc_finetune'] = gpt_score_acc
result_diff_df.loc[row_index, 'acc_response_finetune'] = gpt_response_acc
if (gpt_score_acc is not None) and gpt_score_acc.isdigit():
finetune_acc_score_sum += float(gpt_score_acc)
gpt_response_acc, gpt_score_acc = (self.gpt_scorer.score_with_chatgpt(test_question,
original_model_response,
reference_answer))
result_diff_df.loc[row_index, 'acc_origin'] = gpt_score_acc
result_diff_df.loc[row_index, 'acc_response_origin'] = gpt_response_acc
if (gpt_score_acc is not None) and gpt_score_acc.isdigit():
origin_acc_score_sum += float(gpt_score_acc)
self.gpt_scorer.mode("fluency")
gpt_response_fluency, gpt_score_fluency = (self.gpt_scorer.score_with_chatgpt(test_question,
finetune_model_response,
reference_answer))
result_diff_df.loc[row_index, 'fluency_finetune'] = gpt_score_fluency
result_diff_df.loc[row_index, 'fluency_response_finetune'] = gpt_response_fluency
if (gpt_score_fluency is not None) and gpt_score_fluency.isdigit():
finetune_fluency_score_sum += float(gpt_score_fluency)
gpt_response_fluency, gpt_score_fluency = (self.gpt_scorer.score_with_chatgpt(test_question,
original_model_response,
reference_answer))
result_diff_df.loc[row_index, 'fluency_origin'] = gpt_score_fluency
result_diff_df.loc[row_index, 'fluency_response_origin'] = gpt_response_fluency
if (gpt_score_fluency is not None) and gpt_score_fluency.isdigit():
origin_fluency_score_sum += float(gpt_score_fluency)
self.gpt_scorer.mode("diff")
gpt_response_diff, gpt_score_diff = (self.gpt_scorer.score_with_chatgpt(test_question,
finetune_model_response,
reference_answer,
original_model_response))
result_diff_df.loc[row_index, 'diff_score'] = gpt_score_diff
result_diff_df.loc[row_index, 'diff_score_response'] = gpt_response_diff
if (gpt_score_diff is not None) and gpt_score_diff.isdigit():
model_better_score_sum += float(gpt_score_diff)
result_diff_df.to_csv(self.save_result_dir + '/result_diff_test_score_tmp.csv', index=False)
end_time = time.time()
elapsed_time = end_time - start_time
print("共评估结果" + str(row_count) + "条,总共用时:", elapsed_time, "")
synthesis_rouge_score = finetune_rouge_score_sum / row_count
original_rouge_score = origin_rouge_score_sum / row_count
synthesis_acc_score = finetune_acc_score_sum / row_count
original_acc_score = origin_acc_score_sum / row_count
synthesis_fluency_score = finetune_fluency_score_sum / row_count
original_fluency_score = origin_fluency_score_sum / row_count
synthesis_diff_score = model_better_score_sum / row_count
print("微调模型ROUGE分数", synthesis_rouge_score)
print("原模型ROUGE分数", original_rouge_score)
print("微调模型准确性分数:", synthesis_acc_score)
print("原模型准确性分数:", original_acc_score)
print("微调模型流畅度分数:", synthesis_fluency_score)
print("原模型流畅度分数:", original_fluency_score)
print("微调模型优于原模型分数:", synthesis_diff_score)
synthesis_score = (synthesis_rouge_score * 100 + synthesis_acc_score * 100 / 4 +
synthesis_fluency_score * 100 / 3 + synthesis_diff_score * 100 / 3) / 4
print("综合评分:", synthesis_score)
original_synthesis_score = (original_rouge_score * 100 + original_acc_score * 100 / 4 +
original_fluency_score * 100 / 3 + 66) / 4
print("原模型综合评分:", original_synthesis_score)
# 获取当前时间的字符串
current_time = time.strftime("%Y%m%d%H%M%S", time.localtime(time.time()))
result_diff_df.to_csv(self.save_result_dir + '/' + current_time + '_result_diff_test_score_'
+ str(synthesis_score) + '.csv', index=False)
def eval_result(self, file_name, file_type='csv'):
start_time = time.time()
if file_type == 'json':
result_df = pd.read_json(self.save_result_dir + '/' + file_name)
elif file_type == 'csv':
result_df = pd.read_csv(self.save_result_dir + '/' + file_name)
else:
print("Unsupported file type:" + file_type)
return
result_df['rouge_score_finetune'] = 0
result_df['acc_finetune'] = 0
result_df['fluency_finetune'] = 0
result_df['acc_response_finetune'] = 0
result_df['fluency_response_finetune'] = 0
rouge_score_sum = 0
acc_score_sum = 0
fluency_score_sum = 0
row_count = 0
for row_index, row in tqdm(result_df.iterrows(), total=len(result_df)):
row_count += 1
test_question = row['question']
model_response = row['Predict']
reference_answer = row['answer']
rouge_score = get_rouge_score(model_response, reference_answer)
rouge_1_f_score = rouge_score['rouge-1']['f']
rouge_score_sum += rouge_1_f_score
result_df.loc[row_index, 'rouge_score_finetune'] = rouge_1_f_score
self.gpt_scorer.mode("accuracy")
gpt_response_acc, gpt_score_acc = self.gpt_scorer.score_with_chatgpt(test_question,
model_response, reference_answer)
result_df.loc[row_index, 'acc_finetune'] = gpt_score_acc
result_df.loc[row_index, 'acc_response_finetune'] = gpt_response_acc
if (gpt_score_acc is not None) and gpt_score_acc.isdigit():
acc_score_sum += float(gpt_score_acc)
self.gpt_scorer.mode("fluency")
gpt_response_fluency, gpt_score_fluency = self.gpt_scorer.score_with_chatgpt(test_question,
model_response, reference_answer)
result_df.loc[row_index, 'fluency_finetune'] = gpt_score_fluency
result_df.loc[row_index, 'fluency_response_finetune'] = gpt_response_fluency
if (gpt_score_fluency is not None) and gpt_score_fluency.isdigit():
fluency_score_sum += float(gpt_score_fluency)
result_df.to_csv(self.save_result_dir + '/result_test_score_tmp.csv', index=False)
end_time = time.time()
elapsed_time = end_time - start_time
print("共评估结果" + str(row_count) + "条,总共用时:", elapsed_time, "")
rouge_score = rouge_score_sum / row_count
acc_score = acc_score_sum / row_count
fluency_score = fluency_score_sum / row_count
print("ROUGE分数", rouge_score)
print("准确性分数:", acc_score)
print("流畅度分数:", fluency_score)
synthesis_score = (rouge_score * 100 + acc_score * 100 / 4 + fluency_score * 100 / 3 + 66) / 4
print("综合评分:", synthesis_score)
result_df.to_csv(self.save_result_dir + f'/result_test_score_{synthesis_score}.csv', index=False)