"""Re-score a batch of model-comparison results.

Reads a results CSV produced by an earlier diff-test run, parses the numeric
score out of each stored GPT response with ``extract_score``, writes the
augmented frame to a new CSV, and prints per-metric averages plus a combined
score for both the fine-tuned and the original model.
"""
import math
import os

import pandas as pd

from scoring.gpt_scorer import GPTScorer, extract_score

machine_score_df = pd.read_csv(
    'logs/other/20240408181951_result_diff_test_score_82.95347116717225.csv')

# SECURITY: never hard-code an API key in source — read it from the
# environment instead (the previous inline key must be revoked/rotated).
gpt_scorer = GPTScorer(os.environ.get("OPENAI_API_KEY", ""))

# (column holding the raw GPT response, column to write the parsed score to)
RESPONSE_SCORE_COLUMNS = [
    ('acc_response_finetune', 'acc_finetune'),
    ('acc_response_origin', 'acc_origin'),
    ('fluency_response_finetune', 'fluency_finetune'),
    ('fluency_response_origin', 'fluency_origin'),
    ('diff_score_response', 'diff_score'),
]

# Running totals per parsed-score column, plus the pre-computed ROUGE columns.
score_sums = {score_col: 0.0 for _, score_col in RESPONSE_SCORE_COLUMNS}
finetune_rouge_score_sum = 0.0
origin_rouge_score_sum = 0.0
row_count = 0

for row_index, row in machine_score_df.iterrows():
    row_count += 1
    # One data-driven pass replaces five copy-pasted extract/assign/sum stanzas.
    for response_col, score_col in RESPONSE_SCORE_COLUMNS:
        score = extract_score(row[response_col])
        machine_score_df.loc[row_index, score_col] = score
        score_sums[score_col] += float(score)
    # ROUGE scores are already numeric in the input CSV.
    origin_rouge_score_sum += row['rouge_score_origin']
    finetune_rouge_score_sum += row['rouge_score_finetune']

machine_score_df.to_csv(
    'logs/other/re_20240408181951_result_diff_test_score_82.95347116717225.csv',
    index=False)

# Summary statistics (previously dead commented-out code, reinstated).
# Guard against an empty CSV to avoid ZeroDivisionError.
if row_count:
    synthesis_rouge_score = finetune_rouge_score_sum / row_count
    original_rouge_score = origin_rouge_score_sum / row_count
    synthesis_acc_score = score_sums['acc_finetune'] / row_count
    original_acc_score = score_sums['acc_origin'] / row_count
    synthesis_fluency_score = score_sums['fluency_finetune'] / row_count
    original_fluency_score = score_sums['fluency_origin'] / row_count
    synthesis_diff_score = score_sums['diff_score'] / row_count

    print("微调模型ROUGE分数:", synthesis_rouge_score)
    print("原模型ROUGE分数:", original_rouge_score)
    print("微调模型准确性分数:", synthesis_acc_score)
    print("原模型准确性分数:", original_acc_score)
    print("微调模型流畅度分数:", synthesis_fluency_score)
    print("原模型流畅度分数:", original_fluency_score)
    print("微调模型优于原模型分数:", synthesis_diff_score)

    # Weighted combination: ROUGE is 0-1, accuracy 0-4, fluency 0-3,
    # diff 0-3 — each rescaled to 0-100 before averaging.
    synthesis_score = (synthesis_rouge_score * 100 + synthesis_acc_score * 100 / 4 +
                       synthesis_fluency_score * 100 / 3 + synthesis_diff_score * 100 / 3) / 4
    print("综合评分:", synthesis_score)
    # NOTE(review): the original model's diff term is a fixed 66 (baseline
    # "model better" score) rather than a measured value — kept as authored.
    original_synthesis_score = (original_rouge_score * 100 + original_acc_score * 100 / 4 +
                                original_fluency_score * 100 / 3 + 66) / 4
    print("原模型综合评分:", original_synthesis_score)