From 148a9e1de03a749d9553d1b6816afce27d167ac9 Mon Sep 17 00:00:00 2001
From: PeterAlbus
Date: Tue, 12 Mar 2024 13:23:43 +0800
Subject: [PATCH] Add scoring code
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 eval.py                                   | 37 +++++-----
 evaluators/chatglm2.py                    | 86 ++++++++++++++++++-----
 evaluators/chatgpt.py                     | 31 ++++++++
 scoring/__init__.py                       |  2 -
 scoring/assessment_engine.py              | 37 ++++++++++
 scoring/{gpt_scoring.py => gpt_scorer.py} | 42 ++++++-----
 scoring/{rogue.py => rogue_scorer.py}     |  4 +-
 test.py                                   |  7 ++
 8 files changed, 189 insertions(+), 57 deletions(-)
 create mode 100644 scoring/assessment_engine.py
 rename scoring/{gpt_scoring.py => gpt_scorer.py} (61%)
 rename scoring/{rogue.py => rogue_scorer.py} (93%)
 create mode 100644 test.py

diff --git a/eval.py b/eval.py
index 9a8f2a8..e0328f2 100644
--- a/eval.py
+++ b/eval.py
@@ -34,7 +34,8 @@ def main(args):
             k=args.ntrain,
             model_name=args.model_name,
             device=device,
-            finetune=fine_tune_model
+            finetune=fine_tune_model,
+            finetune_method=args.finetune_method
         )
     elif "chatglm" in args.model_name:
         if args.cuda_device:
@@ -67,21 +68,22 @@

     subject_list = ['computer_architecture', 'car_knowledge', 'car_use', 'car_market']
     qa_subject_list = ['car_knowledge', 'car_use', 'car_market']
+    # qa_subject_list = ['car_use', 'car_market']

-    for subject_name in subject_list:
-        print("Now testing: " + subject_name)
-        # subject_name=args.subject
-        val_file_path = os.path.join('data/val', f'{subject_name}_val.csv')
-        val_df = pd.read_csv(val_file_path)
-        if args.few_shot:
-            dev_file_path = os.path.join('data/dev', f'{subject_name}_dev.csv')
-            dev_df = pd.read_csv(dev_file_path)
-            correct_ratio = evaluator.eval_subject(subject_name, val_df, dev_df, few_shot=args.few_shot,
-                                                   save_result_dir=save_result_dir, cot=args.cot)
-        else:
-            correct_ratio = evaluator.eval_subject(subject_name, val_df, few_shot=args.few_shot,
-                                                   save_result_dir=save_result_dir)
-        print("Acc:", correct_ratio)
+    # for subject_name in subject_list:
+    #     print("Now testing: " + subject_name)
+    #     # subject_name=args.subject
+    #     val_file_path = os.path.join('data/val', f'{subject_name}_val.csv')
+    #     val_df = pd.read_csv(val_file_path)
+    #     if args.few_shot:
+    #         dev_file_path = os.path.join('data/dev', f'{subject_name}_dev.csv')
+    #         dev_df = pd.read_csv(dev_file_path)
+    #         correct_ratio = evaluator.eval_subject(subject_name, val_df, dev_df, few_shot=args.few_shot,
+    #                                                save_result_dir=save_result_dir, cot=args.cot)
+    #     else:
+    #         correct_ratio = evaluator.eval_subject(subject_name, val_df, few_shot=args.few_shot,
+    #                                                save_result_dir=save_result_dir)
+    #     print("Acc:", correct_ratio)

     for subject_name in qa_subject_list:
         print("Now testing: " + subject_name)
@@ -102,5 +104,6 @@ if __name__ == "__main__":
     # parser.add_argument("--subject","-s",type=str,default="operating_system")
     parser.add_argument("--cuda_device", type=str)
     parser.add_argument("--finetune", type=str)
-    args = parser.parse_args()
-    main(args)
+    parser.add_argument("--finetune_method", type=str)
+    user_args = parser.parse_args()
+    main(user_args)
diff --git a/evaluators/chatglm2.py b/evaluators/chatglm2.py
index 03864bd..783a046 100644
--- a/evaluators/chatglm2.py
+++ b/evaluators/chatglm2.py
@@ -8,6 +8,7 @@ from transformers.generation.utils import LogitsProcessorList
 from evaluators.evaluator import Evaluator
 from peft import PeftModel

+
 class InvalidScoreLogitsProcessor(LogitsProcessor):
     def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
         if torch.isnan(scores).any() or torch.isinf(scores).any():
@@ -15,57 +16,82 @@ class InvalidScoreLogitsProcessor(LogitsProcessor):
             scores[..., 5] = 5e4
         return scores

+
 class ChatGLM_Evaluator(Evaluator):
-    def __init__(self, choices, k, model_name, device, finetune=None):
+    def __init__(self, choices, k, model_name, device, finetune=None, finetune_method=None):
         super(ChatGLM_Evaluator, self).__init__(choices, model_name, k)
         # try adding 'mirror="tuna"' and 'resume_download=True' if facing the 'read timed out' problem
         # or directly clone the model
         self.tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm2-6b", trust_remote_code=True, mirror="tuna")
-        self.model = AutoModel.from_pretrained("THUDM/chatglm2-6b", trust_remote_code=True, mirror="tuna", resume_download=True).half().to(device)
-        if finetune:
+        if finetune_method == "lora":
+            self.model = AutoModel.from_pretrained("THUDM/chatglm2-6b", trust_remote_code=True, mirror="tuna",
+                                                   resume_download=True).half().to(device)
             peft_model_id = "lora/" + finetune
             self.model = PeftModel.from_pretrained(self.model, peft_model_id)
             print("Model loaded! use GLM2" + finetune)
+        elif finetune_method == "ptuning":
+            CHECKPOINT_PATH = "ptuning/" + finetune
+            config = AutoConfig.from_pretrained("THUDM/chatglm2-6b", trust_remote_code=True, pre_seq_len=128)
+            self.model = AutoModel.from_pretrained("THUDM/chatglm2-6b", config=config, trust_remote_code=True)
+            prefix_state_dict = torch.load(os.path.join(CHECKPOINT_PATH, "pytorch_model.bin"))
+            new_prefix_state_dict = {}
+            for k, v in prefix_state_dict.items():
+                if k.startswith("transformer.prefix_encoder."):
+                    new_prefix_state_dict[k[len("transformer.prefix_encoder."):]] = v
+            self.model.transformer.prefix_encoder.load_state_dict(new_prefix_state_dict)
+            self.model = self.model.half().to(device)
+            self.model.transformer.prefix_encoder.float()
+            print("Model loaded! use GLM2 + " + finetune)
         else:
+            self.model = AutoModel.from_pretrained("THUDM/chatglm2-6b", trust_remote_code=True, mirror="tuna",
+                                                   resume_download=True).half().to(device)
             print("Model loaded!(GLM2)")
         # self.model = self.model.eval()

     def eval_subject(self, subject_name, test_df, dev_df=None, few_shot=False, cot=False, save_result_dir=None):
         correct_num = 0
-        if save_result_dir:
-            if few_shot:
-                result = []
-            score = []
-            answer_list = []
+        result = []
+        score = []
+        answer_list = []
         if few_shot:
             history = self.generate_few_shot_prompt(subject_name, dev_df, cot=cot)
+            print(history)
         else:
-            history = []
+            # _ , history = self.model.chat(self.tokenizer, "接下来会提供给你一些选择题,请选出正确的答案。", do_sample=False)
+            history = [('接下来会提供给你一些选择题,请选出正确的答案,给出正确的选项即可。', '好的,我会尽力解答。')]
+            # print(history)
         answers = list(test_df['answer'])
         for row_index, row in tqdm(test_df.iterrows(), total=len(test_df)):
             question = self.format_example(row, include_answer=False, cot=cot)
             if few_shot:
-                response, _ = self.model.chat(self.tokenizer, question, do_sample=False, history=history)
+                response, _ = self.model.chat(self.tokenizer, question, max_length=300,
+                                              do_sample=False, history=history)
                 response = response.strip()
                 # For ChatGLM, we use answer extraction in answer-only mode too.
                 ans, direct_extract = self.extract_cot_answer(row, response)
-            else: # zero-shot by extracting answer from distribution
-                ans = self.generate_dist(self.model, self.tokenizer, question, do_sample=False, max_length=2048, history=history)
+            else:  # zero-shot by extracting answer from distribution
+                response, _ = self.model.chat(self.tokenizer, question, max_length=300,
+                                              do_sample=False, history=history)
+                response = response.strip()
+                ans, direct_extract = self.extract_cot_answer(row, response)
+                print(response, ans)
+                # ans = self.generate_dist(self.model, self.tokenizer, question, do_sample=False, max_length=2048,
+                #                          history=history)
             if ans == answers[row_index]:
                 correct_num += 1
                 correct = 1
             else:
                 correct = 0
             if save_result_dir:
-                if few_shot:
-                    result.append(response)
+                # if few_shot:
+                result.append(response)
                 answer_list.append(ans)
                 score.append(correct)

-        correct_ratio = 100*correct_num/len(answers)
+        correct_ratio = 100 * correct_num / len(answers)
         if save_result_dir:
-            if few_shot:
-                test_df['model_output'] = result
+            # if few_shot:
+            test_df['model_output'] = result
             test_df['correctness'] = score
             test_df['model_answer'] = answer_list
             result_file_name = f'{subject_name}_{correct_ratio}_test.csv'
@@ -75,12 +101,33 @@ class ChatGLM_Evaluator(Evaluator):

         return correct_ratio

+    def eval_qa(self, subject_name, qa_df, save_result_dir=None):
+        # history = []
+        history = [('接下来会给你一些一些汽车领域相关问题,请回答。', '好的,我会尽力解答。')]
+        for row_index, row in tqdm(qa_df.iterrows(), total=len(qa_df)):
+            question = row['question']
+            response, _ = self.model.chat(self.tokenizer, question, max_length=300, do_sample=False, history=history)
+            # current_length = 0
+            # response = ""
+            # for resp, _ in self.model.stream_chat(self.tokenizer, question, max_length=300,
+            #                                       do_sample=False, history=history):
+            #     print(resp[current_length:], end="", flush=True)
+            #     current_length = len(resp)
+            #     response = resp
+            # print('')
+            response = response.strip()
+            qa_df.loc[row_index, 'model_output'] = response
+        if save_result_dir:
+            result_file_name = f'{subject_name}_qa_test_result.csv'
+            qa_df.to_csv(os.path.join(save_result_dir, result_file_name))
+
     def generate_few_shot_prompt(self, subject, dev_df, cot=False):
         message = []
         k = self.k
         if self.k == -1:
             k = dev_df.shape[0]
-        message.append(self.format_example(dev_df.iloc[0, :], cot=cot, add_prompt=f"以下是中国关于{subject}考试的单项选择题,请选出其中的正确答案。\n\n"))
+        message.append(self.format_example(dev_df.iloc[0, :], cot=cot,
+                                           add_prompt=f"以下是中国关于{subject}考试的单项选择题,请选出其中的正确答案。\n\n"))
         for i in range(1, k):
             message.append(self.format_example(dev_df.iloc[i, :], cot=cot))
         return message
@@ -157,5 +204,6 @@ class ChatGLM_Evaluator(Evaluator):

         score = outputs.scores[0][0].tolist()
         choice_score = [score[167], score[333], score[251], score[416]]
-        ranked_index = [index for index, value in sorted(list(enumerate(choice_score)), key=lambda x:x[1], reverse=True)]
+        ranked_index = [index for index, value in
+                        sorted(list(enumerate(choice_score)), key=lambda x: x[1], reverse=True)]
         return self.choices[ranked_index[0]]
diff --git a/evaluators/chatgpt.py b/evaluators/chatgpt.py
index aa5fc66..99b63a8 100644
--- a/evaluators/chatgpt.py
+++ b/evaluators/chatgpt.py
@@ -141,6 +141,37 @@ class ChatGPT_Evaluator(Evaluator):
             test_df.to_csv(os.path.join(save_result_dir, result_file_name),encoding="utf-8",index=False)
         return correct_ratio

+    def eval_qa(self, subject_name, qa_df, save_result_dir=None):
+        for row_index, row in tqdm(qa_df.iterrows(),total=len(qa_df)):
+            question = [
+                {"role":"user","content":row['question']}
+            ]
+            full_prompt = question
+            response=None
+            timeout_counter=0
+            while response is None and timeout_counter<=30:
+                try:
+                    response = openai.ChatCompletion.create(
+                        model=self.model_name,
+                        messages=full_prompt,
+                        temperature=0.
+                    )
+                except Exception as msg:
+                    if "timeout=600" in str(msg):
+                        timeout_counter+=1
+                    print(msg)
+                    sleep(5)
+                    continue
+            if response==None:
+                response_str=""
+                qa_df.loc[row_index, 'model_output'] = response_str
+            else:
+                response_str = response['choices'][0]['message']['content']
+                qa_df.loc[row_index, 'model_output'] = response_str
+        if save_result_dir:
+            result_file_name = f'{subject_name}_qa_test_result.csv'
+            qa_df.to_csv(os.path.join(save_result_dir, result_file_name))
+
     def extract_ans(self,response_str):
         pattern=[
             r"^选([A-D])",
diff --git a/scoring/__init__.py b/scoring/__init__.py
index 5384cd7..b28b04f 100644
--- a/scoring/__init__.py
+++ b/scoring/__init__.py
@@ -1,5 +1,3 @@
-# rogue
-from rogue import get_rouge_score



diff --git a/scoring/assessment_engine.py b/scoring/assessment_engine.py
new file mode 100644
index 0000000..5c16612
--- /dev/null
+++ b/scoring/assessment_engine.py
@@ -0,0 +1,37 @@
+from scoring.gpt_scorer import GPTScorer
+from scoring.rogue_scorer import get_rouge_score
+import pandas as pd
+import time
+from tqdm import tqdm
+
+
+class AssessmentEngine:
+    def __init__(self, save_result_dir, api_key):
+        self.save_result_dir = save_result_dir
+        self.gpt_scorer = GPTScorer(api_key)
+
+    def eval_subject(self, subject_name, csv_file_name):
+        qa_result_df = pd.read_csv('logs/' + self.save_result_dir + '/' + csv_file_name)
+        start_time = time.time()
+        row_count = 0
+        rouge_score_sum = 0
+        for row_index, row in tqdm(qa_result_df.iterrows(), total=len(qa_result_df)):
+            row_count += 1
+            test_question = row['question']
+            model_response = row['model_output']
+            reference_answer = row['answer']
+            rouge_score = get_rouge_score(model_response, reference_answer)
+            rouge_1_f_score = rouge_score['rouge-1']['f']
+            rouge_score_sum += rouge_1_f_score
+            qa_result_df.loc[row_index, 'rouge_score'] = rouge_1_f_score
+            self.gpt_scorer.mode("accuracy")
+            gpt_score_acc, gpt_response_acc = self.gpt_scorer.score_with_chatgpt(test_question,
+                                                                                 model_response, reference_answer)
+            qa_result_df.loc[row_index, 'gpt_score_acc'] = gpt_score_acc
+            qa_result_df.loc[row_index, 'gpt_response_acc'] = gpt_response_acc
+        end_time = time.time()
+        elapsed_time = end_time - start_time
+        print("共评估结果" + str(row_count) + "条,总共用时:", elapsed_time, "秒")
+        synthesis_score = rouge_score_sum / row_count
+        qa_result_df.to_csv('logs/' + self.save_result_dir + '/' + subject_name + '_qa_test_score_' +
+                            str(synthesis_score) + '.csv', index=False)
diff --git a/scoring/gpt_scoring.py b/scoring/gpt_scorer.py
similarity index 61%
rename from scoring/gpt_scoring.py
rename to scoring/gpt_scorer.py
index 305e9bf..4e29fe3 100644
--- a/scoring/gpt_scoring.py
+++ b/scoring/gpt_scorer.py
@@ -1,25 +1,31 @@
 import openai
 import re

-class ModelScorer:
+
+class GPTScorer:
     def __init__(self, api_key):
         openai.api_key = api_key
         self.eval_mode = "accuracy"

-    def score_with_chatgpt(self, text):
+    def mode(self, mode):
+        self.eval_mode = mode
+        return self
+
+    def score_with_chatgpt(self, question, model_result, reference):
+        prompt = self.generate_scoring_prompt(question, model_result, reference)
         try:
             # 提交文本以获取ChatGPT评分
             response = openai.ChatCompletion.create(
                 model="gpt-3.5-turbo",
-                messages=text,
+                messages=prompt,
             )
             # 提取评分
             chatgpt_response = response.choices[0]['message']['content']
             chatgpt_score = self.extract_score(chatgpt_response)
-            return chatgpt_response,chatgpt_score
+            return chatgpt_response, chatgpt_score
         except Exception as e:
             print("An error occurred while scoring with ChatGPT:", e)
-            return None
+            return None, None

     def generate_scoring_prompt(self, question, model_result, reference):
         # 生成评分提示
@@ -44,13 +50,13 @@

     def extract_score(self, response_text):
         # 提取评分
-        pattern=[
+        pattern = [
             r"^评分为([1-5])分",
         ]
-        score_list=[]
+        score_list = []
         for p in pattern:
-            if len(score_list)==0:
-                score_list=re.findall(p,response_text)
+            if len(score_list) == 0:
+                score_list = re.findall(p, response_text)
             else:
                 break
         return score_list[0]
@@ -61,17 +67,19 @@ if __name__ == "__main__":
     my_api_key = "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"

     # 初始化模型评分器
-    scorer = ModelScorer(my_api_key)
+    scorer = GPTScorer(my_api_key)

     # 要评分的大模型结果
-    question = "秦Plus-DMi车型的安全气囊有哪些类型?"
-    model_result = "截止到我最后更新知识的时候,关于秦Plus-DMi车型的具体安全气囊类型的信息我并没有。通常来说,汽车的安全气囊系统可能包括驾驶员气囊、副驾驶气囊、侧面气囊、头部气囊等。但具体车型的安全气囊配置可能会因地区、年份和车型的不同而有所差异。建议您直接查询该车型的官方资料或者联系经销商以获取最准确的信息。"
-    reference = "秦Plus-DMi配备有驾驶员安全气囊、前排乘员安全气囊、侧帘式安全气囊和座椅侧安全气囊。"
+    sample_question = "秦Plus-DMi车型的安全气囊有哪些类型?"
+    sample_model_result = ("截止到我最后更新知识的时候,关于秦Plus-DMi车型的具体安全气囊类型的信息我并没有。"
+                           "通常来说,汽车的安全气囊系统可能包括驾驶员气囊、副驾驶气囊、侧面气囊、头部气囊等。"
+                           "但具体车型的安全气囊配置可能会因地区、年份和车型的不同而有所差异。"
+                           "建议您直接查询该车型的官方资料或者联系经销商以获取最准确的信息。")
+    sample_reference = "秦Plus-DMi配备有驾驶员安全气囊、前排乘员安全气囊、侧帘式安全气囊和座椅侧安全气囊。"

-    prompt = scorer.generate_scoring_prompt(question, model_result, reference)
     # 获取ChatGPT评分
-    response,score = scorer.score_with_chatgpt(prompt)
-    if response is not None:
-        print("ChatGPT评分:", score, "\nChatGPT回复:", response)
+    response_text, score = scorer.mode('accuracy').score_with_chatgpt(sample_question, sample_model_result, sample_reference)
+    if response_text is not None:
+        print("ChatGPT评分:", score, "\nChatGPT回复:", response_text)
     else:
         print("无法获取ChatGPT评分。")
diff --git a/scoring/rogue.py b/scoring/rogue_scorer.py
similarity index 93%
rename from scoring/rogue.py
rename to scoring/rogue_scorer.py
index 0655323..1fe7484 100644
--- a/scoring/rogue.py
+++ b/scoring/rogue_scorer.py
@@ -7,8 +7,8 @@ def get_rouge_score(s1, s2):
     rouge = Rouge()
     s1 = " ".join(jieba.cut(s1))
     s2 = " ".join(jieba.cut(s2))
-    print(s1)
-    print(s2)
+    # print(s1)
+    # print(s2)
     return rouge.get_scores(s1, s2)[0]


diff --git a/test.py b/test.py
new file mode 100644
index 0000000..02b2137
--- /dev/null
+++ b/test.py
@@ -0,0 +1,7 @@
+from scoring.assessment_engine import AssessmentEngine
+
+assessment_engine = AssessmentEngine("chatglm2_glm2_pt1_2024-03-08_11-24-47",
+                                     "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")
+assessment_engine.eval_subject("car_knowledge", "car_knowledge_qa_test_result.csv")
+assessment_engine.eval_subject("car_use", "car_use_qa_test_result.csv")
+assessment_engine.eval_subject("car_market", "car_market_qa_test_result.csv")