Bug fix.

main
PeterAlbus 7 months ago
parent af8bd1e046
commit a603aa64cf

@@ -85,7 +85,7 @@ def main(args):
         os.makedirs(save_result_dir)
     subject_list = ['computer_architecture', 'car_knowledge', 'car_use', 'car_market']
-    # subject_list = ['car_knowledge_in_train', 'car_use_in_train', 'car_market_in_train']
+    subject_list.extend(['car_knowledge_in_train', 'car_use_in_train', 'car_market_in_train'])
     # qa_subject_list = ['car_knowledge', 'car_use', 'car_market']
     qa_subject_list = ['car_market']
@@ -104,14 +104,14 @@ def main(args):
                                              save_result_dir=save_result_dir)
         print("Acc:", correct_ratio)
-    # result_list = []
+    result_list = []
     #
     # for subject_name in qa_subject_list:
     #     print("Now testing: " + subject_name)
     #     qa_file_path = os.path.join('data/qa', f'{subject_name}_qa.csv')
     #     qa_df = pd.read_csv(qa_file_path)
     #     result_list.append(evaluator.eval_qa(subject_name, qa_df, save_result_dir=save_result_dir))
     #
     # if evaluator_class is not None:
     #     del evaluator
     #     evaluator = evaluator_class(

@@ -0,0 +1,28 @@
+import pandas as pd
+import os
+
+
+def merge_df(save_result_dir, df1_name, df2_name, df1_row_name, df1_row_new_name, df2_row_name, df2_row_new_name, file_type='csv'):
+    df1_path = str(os.path.join(save_result_dir, df1_name))
+    df2_path = str(os.path.join(save_result_dir, df2_name))
+    if file_type == 'csv':
+        df1 = pd.read_csv(df1_path)
+        df2 = pd.read_csv(df2_path)
+    elif file_type == 'json':
+        df1 = pd.read_json(df1_path)
+        df2 = pd.read_json(df2_path)
+    else:
+        raise ValueError("Invalid file type. Please choose either 'csv' or 'json'.")
+    df2 = df2.rename(columns={df2_row_name: df2_row_new_name})
+    df1 = df1.rename(columns={df1_row_name: df1_row_new_name}).join(df2[df2_row_new_name])
+    result_file_name = f'{df1_name}_{df2_name}_merge.csv'
+    df1.to_csv(os.path.join(save_result_dir, result_file_name))
+
+
+if __name__ == "__main__":
+    dir_path = 'logs/pt_sft'
+    pt_file_name = 'output-pt-sft-1-0.95-0.5-1.2.json'
+    npt_file_name = 'output-npt-sft-1-0.95-0.5-1.2.json'
+    predict_row_name = 'Predict'
+    pt_predict_row_name = 'predict_finetune'
+    npt_predict_row_name = 'predict_origin'
+    merge_df(dir_path, pt_file_name, npt_file_name, predict_row_name, pt_predict_row_name, predict_row_name, npt_predict_row_name, 'json')
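The new script above pairs the fine-tuned and original model predictions column by column. Note that pandas DataFrame.join aligns on the index, so the merge assumes both prediction files list the same questions in the same row order. A minimal sketch of that pairing, using made-up frames in place of the two prediction files:

    import pandas as pd

    # Hypothetical stand-ins for the fine-tuned and original prediction files.
    finetune = pd.DataFrame({'Question': ['q1', 'q2'], 'Predict': ['a1', 'a2']})
    origin = pd.DataFrame({'Question': ['q1', 'q2'], 'Predict': ['b1', 'b2']})

    origin = origin.rename(columns={'Predict': 'predict_origin'})
    merged = finetune.rename(columns={'Predict': 'predict_finetune'}).join(origin['predict_origin'])
    print(merged.columns.tolist())  # ['Question', 'predict_finetune', 'predict_origin']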

@@ -44,18 +44,18 @@ class AssessmentEngine:
         else:
             print("Unknown file type:" + file_type)
             return
-        result_diff_df['rouge_score_finetune'] = 0
-        result_diff_df['rouge_score_origin'] = 0
-        result_diff_df['acc_finetune'] = 0
-        result_diff_df['acc_origin'] = 0
-        result_diff_df['fluency_finetune'] = 0
-        result_diff_df['fluency_origin'] = 0
-        result_diff_df['diff_score'] = 0
-        result_diff_df['acc_response_finetune'] = 0
-        result_diff_df['acc_response_origin'] = 0
-        result_diff_df['fluency_response_finetune'] = 0
-        result_diff_df['fluency_response_origin'] = 0
-        result_diff_df['diff_score_response'] = 0
+        result_diff_df['rouge_score_finetune'] = '0'
+        result_diff_df['rouge_score_origin'] = '0'
+        result_diff_df['acc_finetune'] = '0'
+        result_diff_df['acc_origin'] = '0'
+        result_diff_df['fluency_finetune'] = '0'
+        result_diff_df['fluency_origin'] = '0'
+        result_diff_df['diff_score'] = '0'
+        result_diff_df['acc_response_finetune'] = '0'
+        result_diff_df['acc_response_origin'] = '0'
+        result_diff_df['fluency_response_finetune'] = '0'
+        result_diff_df['fluency_response_origin'] = '0'
+        result_diff_df['diff_score_response'] = '0'
         start_time = time.time()
         finetune_rouge_score_sum = 0
         origin_rouge_score_sum = 0
@@ -163,11 +163,11 @@ class AssessmentEngine:
         else:
             print("Unsupported file type:" + file_type)
             return
-        result_df['rouge_score_finetune'] = 0
-        result_df['acc_finetune'] = 0
-        result_df['fluency_finetune'] = 0
-        result_df['acc_response_finetune'] = 0
-        result_df['fluency_response_finetune'] = 0
+        result_df['rouge_score_finetune'] = '0'
+        result_df['acc_finetune'] = '0'
+        result_df['fluency_finetune'] = '0'
+        result_df['acc_response_finetune'] = '0'
+        result_df['fluency_response_finetune'] = '0'
         rouge_score_sum = 0
         acc_score_sum = 0
         fluency_score_sum = 0
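Switching the column initializers from 0 to '0' changes the dtype of the freshly created score columns: assigning an integer scalar creates an int64 column, while assigning a string creates an object column that can later hold the score text written back per row, which is presumably the point of this change. A small illustration of the difference:

    import pandas as pd

    df = pd.DataFrame({'id': [1, 2]})
    df['score_int'] = 0    # creates an int64 column
    df['score_str'] = '0'  # creates an object (string) column
    print(df.dtypes)

    df.loc[0, 'score_str'] = '2'   # fine: the column already holds strings
    # df.loc[0, 'score_int'] = '2' # incompatible dtype: newer pandas warns and will eventually raise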

@@ -31,7 +31,8 @@ def request_gpt(prompt, retries=3):
             )
             return response.choices[0]['message']['content']
         except Exception as e:
-            print(f"An error occurred while scoring with ChatGPT: {e}, it's the {ordinal(i+1)} time.")
+            print(f"\nAn error occurred while scoring with ChatGPT: {e}, it's the {ordinal(i+1)} time.")
+    print("Failed to get response from ChatGPT. Use default score.")
     return None
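With the added print, the function now announces when every retry has failed before returning None, at which point the caller falls back to a default score. A sketch of that retry-then-fallback pattern; call_model, the sleep interval and the error text are assumptions for illustration, not code from the repository:

    import time

    def call_model(prompt):
        # Placeholder that always fails, just to exercise the retry path.
        raise RuntimeError("model unavailable")

    def request_with_retries(prompt, retries=3):
        for i in range(retries):
            try:
                return call_model(prompt)  # hypothetical stand-in for the real ChatGPT request
            except Exception as e:
                print(f"\nAttempt {i + 1} failed: {e}")
                time.sleep(1)  # brief pause before the next attempt
        print("Failed to get response from ChatGPT. Use default score.")
        return None  # the caller substitutes a default score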
@@ -54,7 +55,7 @@ class GPTScorer:
             chatgpt_score = extract_score(chatgpt_response)
             return chatgpt_response, chatgpt_score
         except Exception as e:
-            print("An error occurred while extract score:", e)
+            print("\nAn error occurred while extract score:", e)
             return None, '2'

     def generate_scoring_prompt(self, question, model_result, reference, origin_model_result=None):
@@ -82,12 +83,12 @@ class GPTScorer:
         elif self.eval_mode == "diff":
             base_prompt = [{
                 "role": "system",
-                "content": "你是一个汽车领域专家,接下来将向你提供一个问题、一个参考答案、一个大模型生成的结果和一个微调后大模型生成结果。"
-                           "请对比这些结果,判断微调后大模型的结果是否优于原模型。满分为3分。"
+                "content": "你是一个汽车领域专家,接下来将向你提供一个问题、一个参考答案、一个大模型1生成的结果和一个大模型2生成结果。"
+                           "请对比这些结果,判断大模型2的结果和大模型1哪个更好。满分为3分。"
                            "信息的准确性应当被首要考虑,多余的未知真假的信息不应该带来加分。"
                            "对比时请关注结果和参考答案的契合度。"
-                           "评分标准为:认为回答优于原模型——3分。认为回答与原模型持平——2分。"
-                           "认为回答不如原模型——1分。"
+                           "评分标准为:认为大模型2的结果更好——3分。认为两者结果持平——2分。"
+                           "认为大模型1的结果更好——1分。"
                            "回复格式为理由xxx。因此评分为x分。"
             }]
         if self.eval_mode == "diff":
@@ -96,8 +97,8 @@ class GPTScorer:
             prompt = base_prompt + [
                 {
                     "role": "user",
-                    "content": f"问题:{question}\n\n原模型生成的结果:{origin_model_result}\n\n"
-                               f"微调后模型生成的结果:{model_result}\n\n参考答案:{reference}"
+                    "content": f"问题:{question}\n\n大模型1生成的结果:{origin_model_result}\n\n"
+                               f"大模型2生成的结果:{model_result}\n\n参考答案:{reference}"
                 }
             ]
         else:
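The rewording in these two hunks stops telling GPT which output came from the fine-tuned model: the scoring prompt now presents the original model as 大模型1 ("model 1") and the fine-tuned model as 大模型2 ("model 2") and asks which answer is better, scoring 3 when model 2 wins, 2 for a tie and 1 when model 1 wins, while keeping the required reply format 理由xxx。因此评分为x分 ("reasoning ...; therefore the score is x"). extract_score itself is not shown in this diff; assuming it parses that reply format, a hypothetical sketch could look like:

    import re

    def extract_score_sketch(response: str) -> str:
        # Hypothetical parser for replies ending in “……评分为x分”; not the repository's extract_score.
        match = re.search(r"评分为\s*(\d)\s*分", response)
        return match.group(1) if match else '2'  # '2' mirrors the default used on failure above

    print(extract_score_sketch("理由:两者内容基本一致。因此评分为2分。"))  # -> '2'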

@@ -7,8 +7,6 @@ def get_rouge_score(s1, s2):
     rouge = Rouge()
     s1 = " ".join(jieba.cut(s1))
     s2 = " ".join(jieba.cut(s2))
-    # print(s1)
-    # print(s2)
     return rouge.get_scores(s1, s2)[0]
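For context, get_rouge_score segments both Chinese strings with jieba (the rouge package expects space-separated tokens) and returns the first element of Rouge.get_scores, a dict keyed by 'rouge-1', 'rouge-2' and 'rouge-l', each holding recall, precision and F1. A minimal usage sketch with made-up sentences:

    import jieba
    from rouge import Rouge

    def get_rouge_score(s1, s2):
        rouge = Rouge()
        s1 = " ".join(jieba.cut(s1))
        s2 = " ".join(jieba.cut(s2))
        return rouge.get_scores(s1, s2)[0]

    scores = get_rouge_score("这款车的油耗很低", "这款车油耗比较低")
    print(scores['rouge-l']['f'])  # ROUGE-L F1 between prediction and reference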

@@ -2,9 +2,9 @@ import argparse
 from scoring.assessment_engine import AssessmentEngine


 def main(args):
-    assessment_engine = AssessmentEngine("logs/other",args.openai_key)
-    assessment_engine.eval_result_diff("0319output.csv")
-    assessment_engine.eval_result("output-pt-sft.json", "json")
+    assessment_engine = AssessmentEngine("logs/lxh",args.openai_key)
+    assessment_engine.eval_result_diff("0416_DoRA_generated_predictions.csv")
+    # assessment_engine.eval_result("output-npt-sft-1-0.95-0.5-1.2.json", "json")
     # synthesis_rouge_score = 0.30358589506467687
     # print("微调模型ROUGE分数", synthesis_rouge_score)
