import os

import pandas as pd

from scoring.gpt_scorer import GPTScorer, extract_score

# Load the machine-scored results produced by the earlier diff-test run
machine_score_df = pd.read_csv('logs/other/20240408181951_result_diff_test_score_82.95347116717225.csv')

# Read the OpenAI API key from the environment rather than hardcoding a secret
# in source. The scorer is instantiated here but not called below; the
# response columns parsed in this pass were presumably generated earlier.
gpt_scorer = GPTScorer(os.environ['OPENAI_API_KEY'])
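
# Hedged sketch (not in the original script): extract_score's failure mode on
# a malformed GPT response is unknown here, so this hypothetical wrapper shows
# one way to keep a single unparseable row from aborting the whole pass. It is
# defined for illustration but not wired into the loop below.
def safe_extract_score(response_text, default=0.0):
    try:
        return float(extract_score(response_text))
    except (TypeError, ValueError):
        return default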

# Running totals used to compute per-metric averages over all rows
finetune_rouge_score_sum = 0
origin_rouge_score_sum = 0
finetune_acc_score_sum = 0
origin_acc_score_sum = 0
finetune_fluency_score_sum = 0
origin_fluency_score_sum = 0
model_better_score_sum = 0
row_count = 0


for row_index, row in machine_score_df.iterrows():
    row_count += 1

    # Fine-tuned model: re-parse the accuracy score from the GPT judge response
    response_text = row['acc_response_finetune']
    score = extract_score(response_text)
    machine_score_df.loc[row_index, 'acc_finetune'] = score
    finetune_acc_score_sum += float(score)

    # Original model: accuracy score
    response_text = row['acc_response_origin']
    score = extract_score(response_text)
    machine_score_df.loc[row_index, 'acc_origin'] = score
    origin_acc_score_sum += float(score)

    # Fine-tuned model: fluency score
    response_text = row['fluency_response_finetune']
    score = extract_score(response_text)
    machine_score_df.loc[row_index, 'fluency_finetune'] = score
    finetune_fluency_score_sum += float(score)

    # Original model: fluency score
    response_text = row['fluency_response_origin']
    score = extract_score(response_text)
    machine_score_df.loc[row_index, 'fluency_origin'] = score
    origin_fluency_score_sum += float(score)

    # Pairwise judgment: how strongly the judge prefers the fine-tuned output
    response_text = row['diff_score_response']
    score = extract_score(response_text)
    machine_score_df.loc[row_index, 'diff_score'] = score
    model_better_score_sum += float(score)

    # ROUGE-1 F-scores were computed earlier and are read straight off the row
    origin_rouge_score_sum += row['rouge_score_origin']
    finetune_rouge_score_sum += row['rouge_score_finetune']

# Persist the re-parsed numeric scores alongside the original responses
machine_score_df.to_csv('logs/other/re_20240408181951_result_diff_test_score_82.95347116717225.csv', index=False)
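
# Equivalent table-driven phrasing (an illustration, not a behavioral change):
# each (response_column, score_column) pair above is re-parsed the same way,
# so the five blocks in the loop could collapse to a vectorized pass:
#
# for resp_col, score_col in [
#     ('acc_response_finetune', 'acc_finetune'),
#     ('acc_response_origin', 'acc_origin'),
#     ('fluency_response_finetune', 'fluency_finetune'),
#     ('fluency_response_origin', 'fluency_origin'),
#     ('diff_score_response', 'diff_score'),
# ]:
#     machine_score_df[score_col] = machine_score_df[resp_col].map(extract_score)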

# Aggregate averages and reporting (currently disabled):
# synthesis_rouge_score = finetune_rouge_score_sum / row_count
# original_rouge_score = origin_rouge_score_sum / row_count
# synthesis_acc_score = finetune_acc_score_sum / row_count
# original_acc_score = origin_acc_score_sum / row_count
# synthesis_fluency_score = finetune_fluency_score_sum / row_count
# original_fluency_score = origin_fluency_score_sum / row_count
# synthesis_diff_score = model_better_score_sum / row_count

# print("Fine-tuned model ROUGE score:", synthesis_rouge_score)
# print("Original model ROUGE score:", original_rouge_score)
# print("Fine-tuned model accuracy score:", synthesis_acc_score)
# print("Original model accuracy score:", original_acc_score)
# print("Fine-tuned model fluency score:", synthesis_fluency_score)
# print("Original model fluency score:", original_fluency_score)
# print("Fine-tuned-better-than-original score:", synthesis_diff_score)

# Composite score (weighted average of normalized components):
# synthesis_score = (synthesis_rouge_score * 100 + synthesis_acc_score * 100 / 4 +
#                    synthesis_fluency_score * 100 / 3 + synthesis_diff_score * 100 / 3) / 4
# print("Composite score:", synthesis_score)

# The original model's preference term is replaced with the constant 66 here.
# original_synthesis_score = (original_rouge_score * 100 + original_acc_score * 100 / 4 +
#                             original_fluency_score * 100 / 3 + 66) / 4
# print("Original model composite score:", original_synthesis_score)
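
# A small live helper mirroring the disabled arithmetic above (an illustration
# added here, not part of the original script): each component is normalized
# to 0-100 under the apparent scales (ROUGE in [0, 1], accuracy in [0, 4],
# fluency and preference in [0, 3]), then the four terms are averaged.
def composite_score(rouge, acc, fluency, diff):
    return (rouge * 100 + acc * 100 / 4 + fluency * 100 / 3 + diff * 100 / 3) / 4

# Example: a model scoring the maximum on every component lands at 100.
# composite_score(1.0, 4, 3, 3) == 100.0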