From a603aa64cf3031d3a22176f892c39074d69fe304 Mon Sep 17 00:00:00 2001
From: PeterAlbus
Date: Wed, 17 Apr 2024 16:51:23 +0800
Subject: [PATCH] Bug fix.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 eval.py                      | 14 +++++++-------
 merge.py                     | 28 ++++++++++++++++++++++++++++
 scoring/assessment_engine.py | 34 +++++++++++++++++-----------------
 scoring/gpt_scorer.py        | 17 +++++++++--------
 scoring/rogue_scorer.py      |  2 --
 test.py                      |  6 +++---
 6 files changed, 64 insertions(+), 37 deletions(-)
 create mode 100644 merge.py

diff --git a/eval.py b/eval.py
index efe8856..275c977 100644
--- a/eval.py
+++ b/eval.py
@@ -85,7 +85,7 @@ def main(args):
         os.makedirs(save_result_dir)
 
     subject_list = ['computer_architecture', 'car_knowledge', 'car_use', 'car_market']
-    # subject_list = ['car_knowledge_in_train', 'car_use_in_train', 'car_market_in_train']
+    subject_list.extend(['car_knowledge_in_train', 'car_use_in_train', 'car_market_in_train'])
     # qa_subject_list = ['car_knowledge', 'car_use', 'car_market']
     qa_subject_list = ['car_market']
 
@@ -104,14 +104,14 @@ def main(args):
                                                        save_result_dir=save_result_dir)
         print("Acc:", correct_ratio)
 
-    # result_list = []
-    #
+    result_list = []
+    #
     # for subject_name in qa_subject_list:
     #     print("Now testing: " + subject_name)
     #     qa_file_path = os.path.join('data/qa', f'{subject_name}_qa.csv')
     #     qa_df = pd.read_csv(qa_file_path)
     #     result_list.append(evaluator.eval_qa(subject_name, qa_df, save_result_dir=save_result_dir))
-    #
+    #
     # if evaluator_class is not None:
     #     del evaluator
     #     evaluator = evaluator_class(
@@ -129,9 +129,9 @@ def main(args):
     #         result_df = result_list[index].rename(columns={"model_output": "predict_finetune"}).join(origin_result["predict_origin"])
     #         result_file_name = f'{subject_name}_qa_compare_result.csv'
     #         result_df.to_csv(os.path.join(save_result_dir, result_file_name))
-    # assessment_engine = AssessmentEngine(save_result_dir, args.openai_key)
-    # for subject_name in qa_subject_list:
-    #     assessment_engine.eval_result_diff(f'{subject_name}_qa_compare_result.csv')
+    # assessment_engine = AssessmentEngine(save_result_dir, args.openai_key)
+    # for subject_name in qa_subject_list:
+    #     assessment_engine.eval_result_diff(f'{subject_name}_qa_compare_result.csv')
 
 
 if __name__ == "__main__":
diff --git a/merge.py b/merge.py
new file mode 100644
index 0000000..b87484e
--- /dev/null
+++ b/merge.py
@@ -0,0 +1,28 @@
+import pandas as pd
+import os
+
+def merge_df(save_result_dir, df1_name, df2_name, df1_row_name, df1_row_new_name, df2_row_name, df2_row_new_name, file_type='csv'):
+    df1_path = str(os.path.join(save_result_dir, df1_name))
+    df2_path = str(os.path.join(save_result_dir, df2_name))
+    if file_type == 'csv':
+        df1 = pd.read_csv(df1_path)
+        df2 = pd.read_csv(df2_path)
+    elif file_type == 'json':
+        df1 = pd.read_json(df1_path)
+        df2 = pd.read_json(df2_path)
+    else:
+        raise ValueError("Invalid file type. Please choose either 'csv' or 'json'.")
+    df2 = df2.rename(columns={df2_row_name: df2_row_new_name})
+    df1 = df1.rename(columns={df1_row_name: df1_row_new_name}).join(df2[df2_row_new_name])
+    result_file_name = f'{df1_name}_{df2_name}_merge.csv'
+    df1.to_csv(os.path.join(save_result_dir, result_file_name))
+
+
+if __name__ == "__main__":
+    dir_path = 'logs/pt_sft'
+    pt_file_name = 'output-pt-sft-1-0.95-0.5-1.2.json'
+    npt_file_name = 'output-npt-sft-1-0.95-0.5-1.2.json'
+    predict_row_name = 'Predict'
+    pt_predict_row_name = 'predict_finetune'
+    npt_predict_row_name = 'predict_origin'
+    merge_df(dir_path, pt_file_name, npt_file_name, predict_row_name, pt_predict_row_name, predict_row_name, npt_predict_row_name, 'json')
diff --git a/scoring/assessment_engine.py b/scoring/assessment_engine.py
index 2195a77..cc49c26 100644
--- a/scoring/assessment_engine.py
+++ b/scoring/assessment_engine.py
@@ -44,18 +44,18 @@ class AssessmentEngine:
         else:
             print("Unknown file type:" + file_type)
             return
-        result_diff_df['rouge_score_finetune'] = 0
-        result_diff_df['rouge_score_origin'] = 0
-        result_diff_df['acc_finetune'] = 0
-        result_diff_df['acc_origin'] = 0
-        result_diff_df['fluency_finetune'] = 0
-        result_diff_df['fluency_origin'] = 0
-        result_diff_df['diff_score'] = 0
-        result_diff_df['acc_response_finetune'] = 0
-        result_diff_df['acc_response_origin'] = 0
-        result_diff_df['fluency_response_finetune'] = 0
-        result_diff_df['fluency_response_origin'] = 0
-        result_diff_df['diff_score_response'] = 0
+        result_diff_df['rouge_score_finetune'] = '0'
+        result_diff_df['rouge_score_origin'] = '0'
+        result_diff_df['acc_finetune'] = '0'
+        result_diff_df['acc_origin'] = '0'
+        result_diff_df['fluency_finetune'] = '0'
+        result_diff_df['fluency_origin'] = '0'
+        result_diff_df['diff_score'] = '0'
+        result_diff_df['acc_response_finetune'] = '0'
+        result_diff_df['acc_response_origin'] = '0'
+        result_diff_df['fluency_response_finetune'] = '0'
+        result_diff_df['fluency_response_origin'] = '0'
+        result_diff_df['diff_score_response'] = '0'
         start_time = time.time()
         finetune_rouge_score_sum = 0
         origin_rouge_score_sum = 0
@@ -163,11 +163,11 @@ class AssessmentEngine:
         else:
             print("Unsupported file type:" + file_type)
             return
-        result_df['rouge_score_finetune'] = 0
-        result_df['acc_finetune'] = 0
-        result_df['fluency_finetune'] = 0
-        result_df['acc_response_finetune'] = 0
-        result_df['fluency_response_finetune'] = 0
+        result_df['rouge_score_finetune'] = '0'
+        result_df['acc_finetune'] = '0'
+        result_df['fluency_finetune'] = '0'
+        result_df['acc_response_finetune'] = '0'
+        result_df['fluency_response_finetune'] = '0'
         rouge_score_sum = 0
         acc_score_sum = 0
         fluency_score_sum = 0
diff --git a/scoring/gpt_scorer.py b/scoring/gpt_scorer.py
index 3c60b01..e91c22e 100644
--- a/scoring/gpt_scorer.py
+++ b/scoring/gpt_scorer.py
@@ -31,7 +31,8 @@ def request_gpt(prompt, retries=3):
             )
             return response.choices[0]['message']['content']
         except Exception as e:
-            print(f"An error occurred while scoring with ChatGPT: {e}, it's the {ordinal(i+1)} time.")
+            print(f"\nAn error occurred while scoring with ChatGPT: {e}, it's the {ordinal(i+1)} time.")
+    print("Failed to get response from ChatGPT. Use default score.")
     return None
 
 
@@ -54,7 +55,7 @@
             chatgpt_score = extract_score(chatgpt_response)
             return chatgpt_response, chatgpt_score
         except Exception as e:
-            print("An error occurred while extract score:", e)
+            print("\nAn error occurred while extract score:", e)
             return None, '2'
 
     def generate_scoring_prompt(self, question, model_result, reference, origin_model_result=None):
@@ -82,12 +83,12 @@
         elif self.eval_mode == "diff":
             base_prompt = [{
                 "role": "system",
-                "content": "你是一个汽车领域专家,接下来将向你提供一个问题、一个参考答案、一个大模型生成的结果和一个微调后大模型生成结果。"
-                           "请对比这些结果,判断微调后大模型的结果是否优于原模型。满分为3分。"
+                "content": "你是一个汽车领域专家,接下来将向你提供一个问题、一个参考答案、一个大模型1生成的结果和一个大模型2生成的结果。"
+                           "请对比这些结果,判断大模型2的结果和大模型1哪个更好。满分为3分。"
                            "信息的准确性应当被首要考虑,多余的未知真假的信息不应该带来加分。"
                            "对比时请关注结果和参考答案的契合度。"
-                           "评分标准为:认为回答优于原模型——3分。认为回答与原模型持平——2分。"
-                           "认为回答不如原模型——1分。"
+                           "评分标准为:认为大模型2的结果更好——3分。认为两者结果持平——2分。"
+                           "认为大模型1的结果更好——1分。"
                            "回复格式为:理由:xxx。因此,评分为x分。"
             }]
         if self.eval_mode == "diff":
@@ -96,8 +97,8 @@
             prompt = base_prompt + [
                 {
                     "role": "user",
-                    "content": f"问题:{question}\n\n原模型生成的结果:{origin_model_result}\n\n"
-                               f"微调后模型生成的结果:{model_result}\n\n参考答案:{reference}"
+                    "content": f"问题:{question}\n\n大模型1生成的结果:{origin_model_result}\n\n"
+                               f"大模型2生成的结果:{model_result}\n\n参考答案:{reference}"
                 }
             ]
         else:
diff --git a/scoring/rogue_scorer.py b/scoring/rogue_scorer.py
index 1fe7484..15d8962 100644
--- a/scoring/rogue_scorer.py
+++ b/scoring/rogue_scorer.py
@@ -7,8 +7,6 @@ def get_rouge_score(s1, s2):
     rouge = Rouge()
     s1 = " ".join(jieba.cut(s1))
     s2 = " ".join(jieba.cut(s2))
-    # print(s1)
-    # print(s2)
     return rouge.get_scores(s1, s2)[0]
 
 
diff --git a/test.py b/test.py
index c735f28..0f95eed 100644
--- a/test.py
+++ b/test.py
@@ -2,9 +2,9 @@ import argparse
 from scoring.assessment_engine import AssessmentEngine
 
 
 def main(args):
-    assessment_engine = AssessmentEngine("logs/other",args.openai_key)
-    assessment_engine.eval_result_diff("0319output.csv")
-    assessment_engine.eval_result("output-pt-sft.json", "json")
+    assessment_engine = AssessmentEngine("logs/lxh",args.openai_key)
+    assessment_engine.eval_result_diff("0416_DoRA_generated_predictions.csv")
+    # assessment_engine.eval_result("output-npt-sft-1-0.95-0.5-1.2.json", "json")
     # synthesis_rouge_score = 0.30358589506467687
     # print("微调模型ROUGE分数:", synthesis_rouge_score)
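
For readers trying out the new merge.py helper introduced by this patch, below is a minimal usage sketch of merge_df for CSV inputs. Only the merge_df signature and behavior come from the patch; the directory, file, and column names are hypothetical examples chosen for illustration.

    from merge import merge_df

    # Hypothetical paths and columns for illustration only. merge_df renames the
    # chosen column in each frame, joins the second frame's renamed column onto
    # the first, and writes '<df1_name>_<df2_name>_merge.csv' into save_result_dir.
    merge_df(
        'logs/example_run',              # save_result_dir (assumed)
        'finetune_predictions.csv',      # df1_name (hypothetical)
        'origin_predictions.csv',        # df2_name (hypothetical)
        'Predict', 'predict_finetune',   # df1_row_name -> df1_row_new_name
        'Predict', 'predict_origin',     # df2_row_name -> df2_row_new_name
        file_type='csv',
    )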