import os

import pandas as pd

from scoring.gpt_scorer import GPTScorer, extract_score

# Per-row GPT judgements produced by the earlier scoring run.
machine_score_df = pd.read_csv('logs/other/20240408181951_result_diff_test_score_82.95347116717225.csv')
# Read the API key from the environment instead of hardcoding a secret in source
# (OPENAI_API_KEY is the conventional variable name, assumed here).
gpt_scorer = GPTScorer(os.environ['OPENAI_API_KEY'])
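
# extract_score is assumed to pull a numeric value out of a GPT judgement such
# as "Score: 4". A hypothetical minimal sketch of that behavior (the real
# implementation lives in scoring/gpt_scorer.py):
#
# import re
# def extract_score(text: str) -> float:
#     match = re.search(r'\d+(?:\.\d+)?', str(text))
#     return float(match.group()) if match else 0.0

# Running sums for per-metric averages over all rows.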
finetune_rouge_score_sum = 0
origin_rouge_score_sum = 0
finetune_acc_score_sum = 0
origin_acc_score_sum = 0
finetune_fluency_score_sum = 0
origin_fluency_score_sum = 0
model_better_score_sum = 0
row_count = 0
for row_index, row in machine_score_df.iterrows():
    row_count += 1
    # Accuracy: parse the GPT judgement for the finetuned and original outputs.
    response_text = row['acc_response_finetune']
    print(response_text)
    score = extract_score(response_text)
    machine_score_df.loc[row_index, 'acc_finetune'] = score
    finetune_acc_score_sum += float(score)
    response_text = row['acc_response_origin']
    score = extract_score(response_text)
    machine_score_df.loc[row_index, 'acc_origin'] = score
    origin_acc_score_sum += float(score)
    # Fluency: same extraction for both models.
    response_text = row['fluency_response_finetune']
    score = extract_score(response_text)
    machine_score_df.loc[row_index, 'fluency_finetune'] = score
    finetune_fluency_score_sum += float(score)
    response_text = row['fluency_response_origin']
    score = extract_score(response_text)
    machine_score_df.loc[row_index, 'fluency_origin'] = score
    origin_fluency_score_sum += float(score)
    # Head-to-head: how much better the finetuned answer was judged to be.
    response_text = row['diff_score_response']
    score = extract_score(response_text)
    machine_score_df.loc[row_index, 'diff_score'] = score
    model_better_score_sum += float(score)
    # ROUGE-1 F-scores were computed earlier and are read straight from the CSV.
    origin_rouge_1_f_score = row['rouge_score_origin']
    origin_rouge_score_sum += origin_rouge_1_f_score
    finetune_rouge_1_f_score = row['rouge_score_finetune']
    finetune_rouge_score_sum += finetune_rouge_1_f_score
# Write the annotated scores back out next to the source log.
machine_score_df.to_csv('logs/other/re_20240408181951_result_diff_test_score_82.95347116717225.csv', index=False)
# Commented-out summary pass: average each metric over all rows and fold the
# averages into a composite score.
# synthesis_rouge_score = finetune_rouge_score_sum / row_count
# original_rouge_score = origin_rouge_score_sum / row_count
# synthesis_acc_score = finetune_acc_score_sum / row_count
# original_acc_score = origin_acc_score_sum / row_count
# synthesis_fluency_score = finetune_fluency_score_sum / row_count
# original_fluency_score = origin_fluency_score_sum / row_count
# synthesis_diff_score = model_better_score_sum / row_count
# print("Finetuned model ROUGE score:", synthesis_rouge_score)
# print("Original model ROUGE score:", original_rouge_score)
# print("Finetuned model accuracy score:", synthesis_acc_score)
# print("Original model accuracy score:", original_acc_score)
# print("Finetuned model fluency score:", synthesis_fluency_score)
# print("Original model fluency score:", original_fluency_score)
# print("Finetuned-better-than-original score:", synthesis_diff_score)
# synthesis_score = (synthesis_rouge_score * 100 + synthesis_acc_score * 100 / 4 +
#                    synthesis_fluency_score * 100 / 3 + synthesis_diff_score * 100 / 3) / 4
# print("Composite score:", synthesis_score)
# original_synthesis_score = (original_rouge_score * 100 + original_acc_score * 100 / 4 +
#                             original_fluency_score * 100 / 3 + 66) / 4  # 66: hardcoded diff term for the original model
# print("Original model composite score:", original_synthesis_score)