You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

68 lines
3.1 KiB
Python

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import math
import os

import pandas as pd

from scoring.gpt_scorer import GPTScorer, extract_score

# Input: one row per evaluation example, holding the raw GPT judge responses
# (accuracy / fluency / head-to-head diff) plus precomputed ROUGE scores for
# the fine-tuned vs. original model comparison run.
INPUT_CSV = 'logs/other/20240408181951_result_diff_test_score_82.95347116717225.csv'
machine_score_df = pd.read_csv(INPUT_CSV)

# SECURITY: an OpenAI API key was previously hard-coded on this line and is
# now leaked in version control — revoke that key. Read it from the
# environment instead; fails fast with KeyError if unset.
gpt_scorer = GPTScorer(os.environ["OPENAI_API_KEY"])

# Running totals used to average each metric over all rows.
finetune_rouge_score_sum = 0
origin_rouge_score_sum = 0
finetune_acc_score_sum = 0
origin_acc_score_sum = 0
finetune_fluency_score_sum = 0
origin_fluency_score_sum = 0
model_better_score_sum = 0
row_count = 0
def _score_column(df, idx, row, src_col, dst_col):
    """Parse the numeric score out of one GPT response column, write it back
    into *df* at (idx, dst_col), and return it as a float for accumulation."""
    score = extract_score(row[src_col])
    df.loc[idx, dst_col] = score
    return float(score)


# NOTE: in the original source the loop body had lost its indentation
# (a SyntaxError as written); the intended structure is restored here, and
# the five copy-pasted extract/store/accumulate stanzas are deduplicated.
for row_index, row in machine_score_df.iterrows():
    row_count += 1
    # Original code printed the raw fine-tuned accuracy response for debugging.
    print(row['acc_response_finetune'])

    # GPT-judged metrics: accuracy and fluency for both models, plus the
    # head-to-head "fine-tuned better than original" diff score.
    finetune_acc_score_sum += _score_column(
        machine_score_df, row_index, row, 'acc_response_finetune', 'acc_finetune')
    origin_acc_score_sum += _score_column(
        machine_score_df, row_index, row, 'acc_response_origin', 'acc_origin')
    finetune_fluency_score_sum += _score_column(
        machine_score_df, row_index, row, 'fluency_response_finetune', 'fluency_finetune')
    origin_fluency_score_sum += _score_column(
        machine_score_df, row_index, row, 'fluency_response_origin', 'fluency_origin')
    model_better_score_sum += _score_column(
        machine_score_df, row_index, row, 'diff_score_response', 'diff_score')

    # ROUGE scores were computed upstream and are already numeric columns.
    origin_rouge_score_sum += row['rouge_score_origin']
    finetune_rouge_score_sum += row['rouge_score_finetune']
# Persist the dataframe with the newly extracted numeric score columns.
output_csv = 'logs/other/re_20240408181951_result_diff_test_score_82.95347116717225.csv'
machine_score_df.to_csv(output_csv, index=False)

# --- Summary reporting (currently disabled in the original source) ---
# Averages each accumulated metric over row_count, then combines them into a
# composite score. Each term appears to be rescaled to a 0-100 range
# (ROUGE in [0,1] -> *100; accuracy presumably out of 4; fluency and diff
# presumably out of 3 — TODO confirm against the GPT scoring prompts).
# synthesis_rouge_score = finetune_rouge_score_sum / row_count
# original_rouge_score = origin_rouge_score_sum / row_count
# synthesis_acc_score = finetune_acc_score_sum / row_count
# original_acc_score = origin_acc_score_sum / row_count
# synthesis_fluency_score = finetune_fluency_score_sum / row_count
# original_fluency_score = origin_fluency_score_sum / row_count
# synthesis_diff_score = model_better_score_sum / row_count
# print("Fine-tuned model ROUGE score:", synthesis_rouge_score)
# print("Original model ROUGE score:", original_rouge_score)
# print("Fine-tuned model accuracy score:", synthesis_acc_score)
# print("Original model accuracy score:", original_acc_score)
# print("Fine-tuned model fluency score:", synthesis_fluency_score)
# print("Original model fluency score:", original_fluency_score)
# print("Fine-tuned-better-than-original score:", synthesis_diff_score)
# synthesis_score = (synthesis_rouge_score * 100 + synthesis_acc_score * 100 / 4 +
#                    synthesis_fluency_score * 100 / 3 + synthesis_diff_score * 100 / 3) / 4
# print("Composite score:", synthesis_score)
# NOTE(review): the literal 66 below stands in for the original model's diff
# term — presumably a fixed baseline; verify where it came from.
# original_synthesis_score = (original_rouge_score * 100 + original_acc_score * 100 / 4 +
#                             original_fluency_score * 100 / 3 + 66) / 4
# print("Original model composite score:", original_synthesis_score)