diff --git a/.gitignore b/.gitignore
index b157785..f1c0bc8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -163,6 +163,8 @@ cython_debug/
 /.idea/
 /THUDM/
 /THUDM/chatglm-6b/
+/THUDM/chatglm2-6b/
+/THUDM/chatglm3-6b/
 /lora/
 /ptuning/
 /logs/
diff --git a/README.md b/README.md
index 83714a4..7cc99b5 100644
--- a/README.md
+++ b/README.md
@@ -4,33 +4,50 @@ A simple program to evaluate large language models.
 
 ## Recommended Requirements
 
-- Python 3.8
+- Python 3.10
 - torch 1.13.1+cu117
 - transformers 4.33.2
 - accelerate 0.26.1
 - tqdm 4.66.1
 - openai 0.28
+- peft 0.10.0
+- google-generativeai
+- pandas
+- sentencepiece 0.2.0
+- rouge_chinese 1.0.3
+- jieba 0.42.1
 
 ## Other Required Files
 
 - Download the [GLM model](https://hf-mirror.com/THUDM/chatglm-6b) and place it under `./THUDM/chatglm-6b`
 - Download the [GLM2 model](https://hf-mirror.com/THUDM/chatglm2-6b) and place it under `./THUDM/chatglm2-6b`
-- Fine-tuned lora checkpoints go under `./lora` and apply to ChatGLM2
-- Fine-tuned ptuning checkpoints go under `./ptuning` and apply to ChatGLM
+- Download the [GLM3 model](https://hf-mirror.com/THUDM/chatglm3-6b) and place it under `./THUDM/chatglm3-6b`
+- Fine-tuned lora checkpoints go under `./lora` and apply to ChatGLM2; e.g. a checkpoint for glm2 goes in `./lora/glm2`
+- Fine-tuned ptuning checkpoints go under `./ptuning` and apply to ChatGLM/ChatGLM2; e.g. a checkpoint for glm goes in `./ptuning/glm1`
+- Fine-tuned qlora/dora checkpoints go under `./qlora` and apply to ChatGLM3; e.g. a checkpoint for glm3 goes in `./qlora/glm3`
+- The checkpoint folder name is what you pass as the fine-tuned model name argument
 - Training data follows the C-Eval format and goes under `./data`; file names must match the `subject_name` values used in `eval.py`
 - Relative to the C-Eval datasets, this code adds a 'qa' dataset under `./data/qa` with open-ended (non-multiple-choice) QA data.
 
 ## Run
 
+Run the model evaluation program:
 ```bash
-python eval.py --model_name chatglm --cuda_device 0 --finetune ptuning1
+python eval.py --model_name chatglm3 --finetune qlora1 --finetune_method qlora --few_shot --ntrain 5 --cuda_device 0
 ```
 
-## Arguments
+Score the result files with an LLM and ROUGE (adjust the paths inside the script first):
+```bash
+python test.py --openai_key [your-api-key]
+```
+
+## Arguments (eval.py)
 
-- `--model_name`: model name, one of `chatglm`, `chatglm2`
+- `--model_name`: model name, one of `chatglm`, `chatglm2`, `chatglm3`, `gpt-3.5-turbo`
 - `--cuda_device`: GPU index
-- `--finetune`: name of the fine-tuned model, i.e. the folder name under `lora/ptuning`
 - `--few_shot`: evaluate with few-shot examples in the prompt (optional)
 - `--ntrain`: number of few-shot examples (optional)
 - `--cot`: use chain-of-thought prompting (optional)
+- `--finetune_method`: fine-tuning method, one of `lora`, `ptuning`, `qlora`, `dora`
+- `--finetune`: name of the fine-tuned checkpoint folder; the matching path is resolved automatically
+- `--openai_key`: OpenAI API key; required when scoring with the OpenAI API or when evaluating GPT models
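Taken together, the README's folder rules reduce to a small lookup. The sketch below is illustrative only: the `ptuning/glm1` and `qlora/glm3` prefixes match the evaluator code in this patch, while the `lora/glm2` prefix is an assumption taken from the README text, since the corresponding hunk in `evaluators/chatglm2.py` is truncated below.

```python
import os

# Hypothetical helper mirroring the checkpoint layout the README describes.
# "lora": "lora/glm2" is assumed from the README, not visible in the patch.
FINETUNE_ROOTS = {
    "ptuning": "ptuning/glm1",  # ChatGLM   -> ./ptuning/glm1/<finetune>
    "lora": "lora/glm2",        # ChatGLM2  -> ./lora/glm2/<finetune>
    "qlora": "qlora/glm3",      # ChatGLM3  -> ./qlora/glm3/<finetune> (qlora/dora)
}


def checkpoint_path(finetune_method: str, finetune: str) -> str:
    """Map --finetune_method and --finetune to the expected checkpoint folder."""
    return os.path.join(FINETUNE_ROOTS[finetune_method], finetune)


print(checkpoint_path("qlora", "qlora1"))  # qlora/glm3/qlora1
```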
diff --git a/eval.py b/eval.py
index 87f15e1..efe8856 100644
--- a/eval.py
+++ b/eval.py
@@ -4,17 +4,20 @@ import pandas as pd
 import torch
 from evaluators.chatgpt import ChatGPT_Evaluator
 from evaluators.chatglm import ChatGLM_Evaluator
-from evaluators.chatglm2 import ChatGLM_Evaluator as ChatGLM2_Evaluator
-from evaluators.chatglm3 import ChatGLM_Evaluator as ChatGLM3_Evaluator
+from evaluators.chatglm2 import ChatGLM2_Evaluator
+from evaluators.chatglm3 import ChatGLM3_Evaluator
 import time
 
+from scoring.assessment_engine import AssessmentEngine
+
 choices = ["A", "B", "C", "D"]
 device = torch.device("cpu")
 
 
 def main(args):
     global device
+    evaluator_class = None
     if args.cuda_device:
         os.environ["CUDA_VISIBLE_DEVICES"] = args.cuda_device
         device = torch.device("cuda")
@@ -28,6 +31,7 @@ def main(args):
     elif "chatglm3" in args.model_name:
         if args.finetune:
             fine_tune_model = args.finetune
+            evaluator_class = ChatGLM3_Evaluator
         else:
             fine_tune_model = None
         evaluator = ChatGLM3_Evaluator(
@@ -41,6 +45,7 @@ def main(args):
     elif "chatglm2" in args.model_name:
         if args.finetune:
             fine_tune_model = args.finetune
+            evaluator_class = ChatGLM2_Evaluator
        else:
             fine_tune_model = None
         evaluator = ChatGLM2_Evaluator(
@@ -54,6 +59,7 @@ def main(args):
     elif "chatglm" in args.model_name:
         if args.finetune:
             fine_tune_model = args.finetune
+            evaluator_class = ChatGLM_Evaluator
         else:
             fine_tune_model = None
         evaluator = ChatGLM_Evaluator(
@@ -76,12 +82,12 @@ def main(args):
     else:
         fine_tune_model_name = 'original'
     save_result_dir = os.path.join(r"logs", f"{args.model_name}_{fine_tune_model_name}/{run_date}")
-    os.mkdir(save_result_dir)
+    os.makedirs(save_result_dir)
 
-    # subject_list = ['computer_architecture', 'car_knowledge', 'car_use', 'car_market']
-    subject_list = ['car_knowledge_in_train', 'car_use_in_train', 'car_market_in_train']
-    qa_subject_list = ['car_knowledge', 'car_use', 'car_market']
-    # qa_subject_list = ['car_use', 'car_market']
+    subject_list = ['computer_architecture', 'car_knowledge', 'car_use', 'car_market']
+    # subject_list = ['car_knowledge_in_train', 'car_use_in_train', 'car_market_in_train']
+    # qa_subject_list = ['car_knowledge', 'car_use', 'car_market']
+    qa_subject_list = ['car_market']
 
     for subject_name in subject_list:
         print("Now testing: " + subject_name)
@@ -98,11 +104,34 @@ def main(args):
                                             save_result_dir=save_result_dir)
         print("Acc:", correct_ratio)
 
+    # result_list = []
+    # #
     # for subject_name in qa_subject_list:
     #     print("Now testing: " + subject_name)
     #     qa_file_path = os.path.join('data/qa', f'{subject_name}_qa.csv')
     #     qa_df = pd.read_csv(qa_file_path)
-    #     evaluator.eval_qa(subject_name, qa_df, save_result_dir=save_result_dir)
+    #     result_list.append(evaluator.eval_qa(subject_name, qa_df, save_result_dir=save_result_dir))
+    #
+    # if evaluator_class is not None:
+    #     del evaluator
+    #     evaluator = evaluator_class(
+    #         choices=choices,
+    #         k=args.ntrain,
+    #         model_name=args.model_name,
+    #         device=device
+    #     )
+    # for index, subject_name in enumerate(qa_subject_list):
+    #     print("Now testing (origin): " + subject_name)
+    #     qa_file_path = os.path.join('data/qa', f'{subject_name}_qa.csv')
+    #     qa_df = pd.read_csv(qa_file_path)
+    #     origin_result = evaluator.eval_qa(subject_name, qa_df, save_result_dir=save_result_dir)
+    #     origin_result = origin_result.rename(columns={"model_output": "predict_origin"})
+    #     result_df = result_list[index].rename(columns={"model_output": "predict_finetune"}).join(origin_result["predict_origin"])
+    #     result_file_name = f'{subject_name}_qa_compare_result.csv'
+    #     result_df.to_csv(os.path.join(save_result_dir, result_file_name))
+    # assessment_engine = AssessmentEngine(save_result_dir, args.openai_key)
+    # for subject_name in qa_subject_list:
+    #     assessment_engine.eval_result_diff(f'{subject_name}_qa_compare_result.csv')
 
 
 if __name__ == "__main__":
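The commented-out block at the end of `main()` sketches a two-pass comparison: the fine-tuned evaluator answers the qa subjects, the base model is then re-instantiated via `evaluator_class` without a checkpoint and answers the same questions, and the two outputs are joined column-wise. On toy data, the join step works like this:

```python
import pandas as pd

# Toy stand-ins for the DataFrames returned by eval_qa().
finetune_result = pd.DataFrame({"question": ["Q1", "Q2"],
                                "model_output": ["ans1 (finetuned)", "ans2 (finetuned)"]})
origin_result = pd.DataFrame({"question": ["Q1", "Q2"],
                              "model_output": ["ans1 (original)", "ans2 (original)"]})

# Same rename-and-join pattern as the commented-out code above.
result_df = (finetune_result.rename(columns={"model_output": "predict_finetune"})
             .join(origin_result.rename(columns={"model_output": "predict_origin"})["predict_origin"]))
print(result_df.columns.tolist())  # ['question', 'predict_finetune', 'predict_origin']
```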
diff --git a/evaluators/chatglm.py b/evaluators/chatglm.py
index 2b8ba2e..4f29f55 100644
--- a/evaluators/chatglm.py
+++ b/evaluators/chatglm.py
@@ -6,21 +6,14 @@ from transformers import AutoTokenizer, AutoModel, AutoConfig
 from transformers.generation.logits_process import LogitsProcessor
 from transformers.generation.utils import LogitsProcessorList
 from evaluators.evaluator import Evaluator
+from evaluators.chatglm_mixin import ChatGLMMixin
 
 
-class InvalidScoreLogitsProcessor(LogitsProcessor):
-    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
-        if torch.isnan(scores).any() or torch.isinf(scores).any():
-            scores.zero_()
-            scores[..., 5] = 5e4
-        return scores
-
-
-class ChatGLM_Evaluator(Evaluator):
-    def __init__(self, choices, k, model_name, device, finetune=None, finetune_method=None):
+class ChatGLM_Evaluator(Evaluator, ChatGLMMixin):
+    def __init__(self, choices, k, model_name, device='cpu', finetune=None, finetune_method=None):
         super(ChatGLM_Evaluator, self).__init__(choices, model_name, k)
-        # try adding 'mirror="tuna"' and 'resume_download=True' if facing the 'read timed out' problem
-        # or directly clone the model
+        self.finetune_method = finetune_method
+        self.finetune_name = finetune
         self.tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True, mirror="tuna")
         if finetune_method == "ptuning":
             CHECKPOINT_PATH = "ptuning/glm1/" + finetune
@@ -39,153 +32,4 @@ class ChatGLM_Evaluator(Evaluator):
             self.model = AutoModel.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True, mirror="tuna",
                                                    resume_download=True).half().to(device)
             print("Model loaded! (GLM original)")
-        # self.model = self.model.eval()
-
-    def eval_subject(self, subject_name, test_df, dev_df=None, few_shot=False, cot=False, save_result_dir=None):
-        correct_num = 0
-        result = []
-        score = []
-        answer_list = []
-        if few_shot:
-            history = self.generate_few_shot_prompt(subject_name, dev_df, cot=cot)
-        else:
-            history = []
-        answers = list(test_df['answer'])
-        for row_index, row in tqdm(test_df.iterrows(), total=len(test_df)):
-            question = self.format_example(row, include_answer=False, cot=cot)
-            if few_shot:
-                response, _ = self.model.chat(self.tokenizer, question, do_sample=False, history=history)
-                response = response.strip()
-                # For ChatGLM, we use answer extraction in answer-only mode too.
-                ans, direct_extract = self.extract_cot_answer(row, response)
-            else:   # zero-shot by extracting answer from distribution
-                response, _ = self.model.chat(self.tokenizer, question, max_length=300,
-                                              do_sample=False, history=history)
-                response = response.strip()
-                ans, direct_extract = self.extract_cot_answer(row, response)
-                # print(response, ans)
-                # ans = self.generate_dist(self.model, self.tokenizer, question, do_sample=False, max_length=2048,
-                #                          history=history)
-            if ans == answers[row_index]:
-                correct_num += 1
-                correct = 1
-            else:
-                correct = 0
-            if save_result_dir:
-                if few_shot:
-                    result.append(response)
-                score.append(correct)
-                answer_list.append(ans)
-        correct_ratio = 100 * correct_num / len(answers)
-
-        if save_result_dir:
-            if few_shot:
-                test_df['model_output'] = result
-            test_df['correctness'] = score
-            test_df['model_answer'] = answer_list
-            result_file_name = f'{subject_name}_{correct_ratio}_test.csv'
-            if few_shot:
-                result_file_name = f'{subject_name}_{correct_ratio}_few_shot_test.csv'
-            test_df.to_csv(os.path.join(save_result_dir, result_file_name))
-
-        return correct_ratio
-
-    def eval_qa(self, subject_name, qa_df, save_result_dir=None):
-        history = []
-        for row_index, row in tqdm(qa_df.iterrows(), total=len(qa_df)):
-            question = row['question']
-            response, _ = self.model.chat(self.tokenizer, question, do_sample=False, history=history)
-            response = response.strip()
-            qa_df.loc[row_index, 'model_output'] = response
-        if save_result_dir:
-            result_file_name = f'{subject_name}_qa_test_result.csv'
-            qa_df.to_csv(os.path.join(save_result_dir, result_file_name))
-
-    def generate_few_shot_prompt(self, subject, dev_df, cot=False):
-        message = []
-        k = self.k
-        if self.k == -1:
-            k = dev_df.shape[0]
-        message.append(self.format_example(dev_df.iloc[0, :], cot=cot,
-                                           add_prompt=f"以下是中国关于{subject}考试的单项选择题,请选出其中的正确答案。\n\n"))
-        for i in range(1, k):
-            message.append(self.format_example(dev_df.iloc[i, :], cot=cot))
-        return message
-
-    def format_example(self, line, include_answer=True, cot=False, add_prompt=''):
-        example = add_prompt + line['question']
-        # print(example)
-        for choice in self.choices:
-            example += f'\n{choice}. {line[f"{choice}"]}'
-        example += '\n答案:'
-        if include_answer:
-            if cot:
-                ans = "让我们一步一步思考,\n" + line["explanation"] + f"\n所以答案是{line['answer']}。"
-            else:
-                ans = line["answer"]
-            m = (example, ans)
-            return m
-        return example
-
-    def extract_cot_answer(self, line, gen_ans):
-        m = re.findall(r'所以答案是(.+?)。', gen_ans, re.M)
-        if len(m) > 0 and m[-1] in self.choices:
-            return m[-1], True
-        answer_patterns = [
-            r'([ABCD])是正确的',
-            r'选项([ABCD])正确',
-            r'答案为([ABCD])',
-            r'答案是([ABCD])',
-            r'答案([ABCD])',
-            r'选择([ABCD])',
-            r'答案:([ABCD])',
-            r'选择答案([ABCD])',
-            r'正确答案是([ABCD])'
-        ]
-        # RE extraction
-        for answer_pattern in answer_patterns:
-            m = re.search(answer_pattern, gen_ans, re.M)
-            if m:
-                answer = m.group(1)
-                return answer, False
-        # only containing one choice-character
-        m = re.findall(r'[ABCD]', gen_ans, re.M)
-        if len(m) == 1:
-            answer = m[0]
-            return answer, False
-        answer_word_counter = 0
-        # only containing one choice-context
-        for c in self.choices:
-            if str(line[f'{c}']) in gen_ans:
-                answer = c
-                answer_word_counter += 1
-        if answer_word_counter == 1:
-            return answer, False
-        return '-', False
-
-    def generate_dist(self, model, tokenizer, query, history, num_beams=1, max_length=2048,
-                      do_sample=False, top_p=0.7, temperature=0.95, logits_processor=None, **kwargs):
-        if history is None:
-            history = []
-        if logits_processor is None:
-            logits_processor = LogitsProcessorList()
-        logits_processor.append(InvalidScoreLogitsProcessor())
-        gen_kwargs = {"num_beams": num_beams, "do_sample": do_sample, "top_p": top_p, "max_length": 2048,
-                      "temperature": temperature, "logits_processor": logits_processor, **kwargs}
-        if not history:
-            prompt = query
-        else:
-            prompt = ""
-            for i, (old_query, response) in enumerate(history):
-                prompt += "[Round {}]\n问:{}\n答:{}\n".format(i, old_query, response)
-            prompt += "[Round {}]\n问:{}\n答:".format(len(history), query)
-        inputs = tokenizer([prompt], return_tensors="pt")
-        inputs = inputs.to(model.device)
-        outputs = model.generate(**inputs, return_dict_in_generate=True, output_scores=True, **gen_kwargs)
-
-        score = outputs.scores[0][0].tolist()
-        choice_score = [score[167], score[333], score[251], score[416]]
-        ranked_index = [index for index, value in
-                        sorted(list(enumerate(choice_score)), key=lambda x: x[1], reverse=True)]
-        return self.choices[ranked_index[0]]
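Note the base-class order in `class ChatGLM_Evaluator(Evaluator, ChatGLMMixin)`: `Evaluator` precedes the mixin in the MRO, which is why the no-op `eval_subject`/`eval_qa` stubs are deleted from `Evaluator` later in this patch; had they stayed, they would shadow the mixin's implementations. A toy illustration of the lookup order:

```python
class Base:
    pass  # if Base defined run(), it would win the lookup below


class Mixin:
    def run(self):
        return "mixin implementation"


class Impl(Base, Mixin):  # same shape as ChatGLM_Evaluator(Evaluator, ChatGLMMixin)
    pass


print(Impl().run())                         # mixin implementation
print([c.__name__ for c in Impl.__mro__])   # ['Impl', 'Base', 'Mixin', 'object']
```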
{line[f"{choice}"]}' - example += '\n答案:' - if include_answer: - if cot: - ans = "让我们一步一步思考,\n" + line["explanation"] + f"\n所以答案是{line['answer']}。" - else: - ans = line["answer"] - m = (example, ans) - return m - return example - - def extract_cot_answer(self, line, gen_ans): - m = re.findall(r'所以答案是(.+?)。', gen_ans, re.M) - if len(m) > 0 and m[-1] in self.choices: - return m[-1], True - answer_patterns = [ - r'([ABCD])是正确的', - r'选项([ABCD])正确', - r'答案为([ABCD])', - r'答案是([ABCD])', - r'答案([ABCD])', - r'选择([ABCD])', - r'答案:([ABCD])', - r'选择答案([ABCD])', - r'正确答案是([ABCD])' - ] - # RE extraction - for answer_pattern in answer_patterns: - m = re.search(answer_pattern, gen_ans, re.M) - if m: - answer = m.group(1) - return answer, False - # only containing one choice-character - m = re.findall(r'[ABCD]', gen_ans, re.M) - if len(m) == 1: - answer = m[0] - return answer, False - answer_word_counter = 0 - # only containing one choice-context - for c in self.choices: - if str(line[f'{c}']) in gen_ans: - answer = c - answer_word_counter += 1 - if answer_word_counter == 1: - return answer, False - return '-', False - - def generate_dist(self, model, tokenizer, query, history, num_beams=1, max_length=2048, - do_sample=False, top_p=0.7, temperature=0.95, logits_processor=None, **kwargs): - if history is None: - history = [] - if logits_processor is None: - logits_processor = LogitsProcessorList() - logits_processor.append(InvalidScoreLogitsProcessor()) - gen_kwargs = {"num_beams": num_beams, "do_sample": do_sample, "top_p": top_p, "max_length": 2048, - "temperature": temperature, "logits_processor": logits_processor, **kwargs} - if not history: - prompt = query - else: - prompt = "" - for i, (old_query, response) in enumerate(history): - prompt += "[Round {}]\n问:{}\n答:{}\n".format(i, old_query, response) - prompt += "[Round {}]\n问:{}\n答:".format(len(history), query) - inputs = tokenizer([prompt], return_tensors="pt") - inputs = inputs.to(model.device) - outputs = model.generate(**inputs, return_dict_in_generate=True, output_scores=True, **gen_kwargs) - - score = outputs.scores[0][0].tolist() - choice_score = [score[167], score[333], score[251], score[416]] - ranked_index = [index for index, value in - sorted(list(enumerate(choice_score)), key=lambda x: x[1], reverse=True)] - return self.choices[ranked_index[0]] diff --git a/evaluators/chatglm2.py b/evaluators/chatglm2.py index 315d936..77adda7 100644 --- a/evaluators/chatglm2.py +++ b/evaluators/chatglm2.py @@ -3,25 +3,15 @@ import re from tqdm import tqdm import torch from transformers import AutoTokenizer, AutoModel, AutoConfig -from transformers.generation.logits_process import LogitsProcessor -from transformers.generation.utils import LogitsProcessorList from evaluators.evaluator import Evaluator +from evaluators.chatglm_mixin import ChatGLMMixin from peft import PeftModel - -class InvalidScoreLogitsProcessor(LogitsProcessor): - def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor: - if torch.isnan(scores).any() or torch.isinf(scores).any(): - scores.zero_() - scores[..., 5] = 5e4 - return scores - - -class ChatGLM_Evaluator(Evaluator): +class ChatGLM2_Evaluator(Evaluator, ChatGLMMixin): def __init__(self, choices, k, model_name, device, finetune=None, finetune_method=None): - super(ChatGLM_Evaluator, self).__init__(choices, model_name, k) - # try adding 'mirror="tuna"' and 'resume_download=True' if facing the 'read timed out' problem - # or directly clone the model + super(ChatGLM2_Evaluator, 
diff --git a/evaluators/chatglm3.py b/evaluators/chatglm3.py
index f635b10..8c4c3b5 100644
--- a/evaluators/chatglm3.py
+++ b/evaluators/chatglm3.py
@@ -3,13 +3,11 @@ import re
 from tqdm import tqdm
 import torch
 from transformers import AutoTokenizer, AutoModel, AutoConfig
-from transformers.generation.logits_process import LogitsProcessor
-from transformers.generation.utils import LogitsProcessorList
 from evaluators.evaluator import Evaluator
+from evaluators.chatglm_mixin import ChatGLMMixin
 from pathlib import Path
 from typing import Union, Tuple
 
-import typer
 from peft import AutoPeftModelForCausalLM, PeftModelForCausalLM
 from transformers import (
     AutoModelForCausalLM,
@@ -28,12 +26,12 @@ def _resolve_path(path: Union[str, Path]) -> Path:
     return Path(path).expanduser().resolve()
 
 
-def load_model_and_tokenizer(model_dir: Union[str, Path]) -> Tuple[ModelType, TokenizerType]:
+def load_model_and_tokenizer(model_dir: Union[str, Path], device) -> Tuple[ModelType, TokenizerType]:
     model_dir = _resolve_path(model_dir)
     if (model_dir / 'adapter_config.json').exists():
-        config = PeftConfig.from_pretrained(model_dir)
+        config = PeftConfig.from_pretrained(str(model_dir))
         base_model = AutoModel.from_pretrained("THUDM/chatglm3-6b", trust_remote_code=True, mirror="tuna",
-                                               resume_download=True)
+                                               resume_download=True).to(device)
         # base_model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path,trust_remote_code=True,
        #                                                   device_map='auto')
         model = PeftModel.from_pretrained(base_model, model_dir)
@@ -50,204 +48,19 @@ def load_model_and_tokenizer(model_dir: Union[str, Path]) -> Tuple[ModelType, TokenizerType]:
     return model, tokenizer
 
 
-class InvalidScoreLogitsProcessor(LogitsProcessor):
-    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
-        if torch.isnan(scores).any() or torch.isinf(scores).any():
-            scores.zero_()
-            scores[..., 5] = 5e4
-        return scores
-
-
-class ChatGLM_Evaluator(Evaluator):
+class ChatGLM3_Evaluator(Evaluator, ChatGLMMixin):
     def __init__(self, choices, k, model_name, device, finetune=None, finetune_method=None):
-        super(ChatGLM_Evaluator, self).__init__(choices, model_name, k)
-        # try adding 'mirror="tuna"' and 'resume_download=True' if facing the 'read timed out' problem
-        # or directly clone the model
-
+        super(ChatGLM3_Evaluator, self).__init__(choices, model_name, k)
+        self.finetune_method = finetune_method
+        self.finetune_name = finetune
         self.tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm3-6b", trust_remote_code=True, mirror="tuna")
         if finetune_method == "qlora":
             model_dir = 'qlora/glm3/' + finetune
-            self.model, self.tokenizer = load_model_and_tokenizer(model_dir)
+            self.model, self.tokenizer = load_model_and_tokenizer(model_dir, device)
             self.model = self.model.half().to(device)
             print("Model loaded! use GLM3 " + finetune)
         else:
             self.model = AutoModel.from_pretrained("THUDM/chatglm3-6b", trust_remote_code=True, mirror="tuna",
                                                    resume_download=True).half().to(device)
             print("Model loaded! (GLM3)")
-        # prompt = '以下是中国关于car_knowledge_in_train考试的单项选择题,请选出其中的正确答案。\n\n比亚迪的刀片电池采用哪种电池技术?\
-        # nA. 镍氢电池\nB. 锂离子电池\nC. 磷酸铁锂电池\nD. 液态电池\n答案:'
-        # response, history = self.model.chat(self.tokenizer, prompt, max_length=128)
-        # print(history)
-        # current_length = 0
-        # response = ""
-        # for resp, _ in self.model.stream_chat(self.tokenizer, prompt, max_length=300,
-        #                                       do_sample=False):
-        #     print(resp[current_length:], end="", flush=True)
-        #     current_length = len(resp)
-        #     response = resp
-        # print('')
         self.model = self.model.eval()
-
-    def eval_subject(self, subject_name, test_df, dev_df=None, few_shot=False, cot=False, save_result_dir=None):
-        correct_num = 0
-        result = []
-        score = []
-        answer_list = []
-        if few_shot:
-            history = self.generate_few_shot_prompt(subject_name, dev_df, cot=cot)
-        else:
-            # _ , history = self.model.chat(self.tokenizer, "接下来会提供给你一些选择题,请选出正确的答案。", do_sample=False)
-            history = [{'role': 'user',
-                        'content': '接下来会提供给你一些选择题,请选出正确的答案,给出正确的选项即可。'},
-                       {'role': 'assistant',
-                        'content': '好的,我会尽力解答。'}]
-        answers = list(test_df['answer'])
-        for row_index, row in tqdm(test_df.iterrows(), total=len(test_df)):
-            question = self.format_example(row, include_answer=False, cot=cot)
-            if few_shot:
-                history_temp = history.copy()
-                response, _ = self.model.chat(self.tokenizer, question, max_length=300,
-                                              do_sample=False, history=history_temp)
-                response = response.strip()
-                # For ChatGLM, we use answer extraction in answer-only mode too.
-                ans, direct_extract = self.extract_cot_answer(row, response)
-            else:   # zero-shot by extracting answer from distribution
-                response, _ = self.model.chat(self.tokenizer, question, max_length=300,
-                                              do_sample=False, history=history)
-                response = response.strip()
-                ans, direct_extract = self.extract_cot_answer(row, response)
-                print(response, ans)
-                # ans = self.generate_dist(self.model, self.tokenizer, question, do_sample=False, max_length=2048,
-                #                          history=history)
-            if ans == answers[row_index]:
-                correct_num += 1
-                correct = 1
-            else:
-                correct = 0
-            if save_result_dir:
-                # if few_shot:
-                result.append(response)
-                answer_list.append(ans)
-                score.append(correct)
-        correct_ratio = 100 * correct_num / len(answers)
-
-        if save_result_dir:
-            # if few_shot:
-            test_df['model_output'] = result
-            test_df['correctness'] = score
-            test_df['model_answer'] = answer_list
-            result_file_name = f'{subject_name}_{correct_ratio}_test.csv'
-            if few_shot:
-                result_file_name = f'{subject_name}_{correct_ratio}_few_shot_test.csv'
-            test_df.to_csv(os.path.join(save_result_dir, result_file_name))
-
-        return correct_ratio
-
-    def eval_qa(self, subject_name, qa_df, save_result_dir=None):
-        # history = []
-        history = [{'role': 'user',
-                    'content': '接下来会给你一些一些汽车领域相关问题,请回答。'},
-                   {'role': 'assistant',
-                    'content': '好的,我会尽力解答。'}]
-        for row_index, row in tqdm(qa_df.iterrows(), total=len(qa_df)):
-            question = row['question']
-            response, _ = self.model.chat(self.tokenizer, question, max_length=300, do_sample=False, history=history)
-            response = response.strip()
-            qa_df.loc[row_index, 'model_output'] = response
-        if save_result_dir:
-            result_file_name = f'{subject_name}_qa_test_result.csv'
-            qa_df.to_csv(os.path.join(save_result_dir, result_file_name))
-
-    def generate_few_shot_prompt(self, subject, dev_df, cot=False):
-        message = []
-        k = self.k
-        if self.k == -1:
-            k = dev_df.shape[0]
-
-        message.extend(self.format_example(dev_df.iloc[0, :], cot=cot,
-                                           add_prompt=f"以下是中国关于{subject}考试的单项选择题,请选出其中的正确答案。\n\n"))
-        for i in range(1, k):
-            message.extend(self.format_example(dev_df.iloc[i, :], cot=cot))
-        return message
-
-    def format_example(self, line, include_answer=True, cot=False, add_prompt=''):
-        example = add_prompt + line['question']
-        # print(example)
-        for choice in self.choices:
-            example += f'\n{choice}. {line[f"{choice}"]}'
-        example += '\n答案:'
-        if include_answer:
-            if cot:
-                ans = "让我们一步一步思考,\n" + line["explanation"] + f"\n所以答案是{line['answer']}。"
-            else:
-                ans = line["answer"]
-            m = [{
-                'role': 'user',
-                'content': example
-            }, {
-                'role': 'assistant',
-                'content': ans
-            }]
-            return m
-        return example
-
-    def extract_cot_answer(self, line, gen_ans):
-        m = re.findall(r'所以答案是(.+?)。', gen_ans, re.M)
-        if len(m) > 0 and m[-1] in self.choices:
-            return m[-1], True
-        answer_patterns = [
-            r'([ABCD])是正确的',
-            r'选项([ABCD])正确',
-            r'答案为([ABCD])',
-            r'答案是([ABCD])',
-            r'答案([ABCD])',
-            r'选择([ABCD])',
-            r'答案:([ABCD])',
-            r'选择答案([ABCD])'
-        ]
-        # RE extraction
-        for answer_pattern in answer_patterns:
-            m = re.search(answer_pattern, gen_ans, re.M)
-            if m:
-                answer = m.group(1)
-                return answer, False
-        # only containing one choice-character
-        m = re.findall(r'[ABCD]', gen_ans, re.M)
-        if len(m) == 1:
-            answer = m[0]
-            return answer, False
-        answer_word_counter = 0
-        # only containing one choice-context
-        for c in self.choices:
-            if str(line[f'{c}']) in gen_ans:
-                answer = c
-                answer_word_counter += 1
-        if answer_word_counter == 1:
-            return answer, False
-        return '-', False
-
-    def generate_dist(self, model, tokenizer, query, history, num_beams=1, max_length=2048,
-                      do_sample=False, top_p=0.7, temperature=0.95, logits_processor=None, **kwargs):
-        if history is None:
-            history = []
-        if logits_processor is None:
-            logits_processor = LogitsProcessorList()
-        logits_processor.append(InvalidScoreLogitsProcessor())
-        gen_kwargs = {"num_beams": num_beams, "do_sample": do_sample, "top_p": top_p, "max_length": 2048,
-                      "temperature": temperature, "logits_processor": logits_processor, **kwargs}
-        if not history:
-            prompt = query
-        else:
-            prompt = ""
-            for i, (old_query, response) in enumerate(history):
-                prompt += "[Round {}]\n问:{}\n答:{}\n".format(i, old_query, response)
-            prompt += "[Round {}]\n问:{}\n答:".format(len(history), query)
-        inputs = tokenizer([prompt], return_tensors="pt")
-        inputs = inputs.to(model.device)
-        outputs = model.generate(**inputs, return_dict_in_generate=True, output_scores=True, **gen_kwargs)
-
-        score = outputs.scores[0][0].tolist()
-        choice_score = [score[167], score[333], score[251], score[416]]
-        ranked_index = [index for index, value in
-                        sorted(list(enumerate(choice_score)), key=lambda x: x[1], reverse=True)]
-        return self.choices[ranked_index[0]]
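`load_model_and_tokenizer` treats a directory containing `adapter_config.json` as a PEFT adapter to stack on the base ChatGLM3 weights, and anything else as a full checkpoint. A condensed sketch of that branch logic; the non-adapter branch and the tokenizer source are not fully visible in the hunk above, so those parts are assumptions:

```python
from pathlib import Path

from peft import PeftModel
from transformers import AutoModel, AutoTokenizer


def load_adapter_or_full(model_dir: str, device):
    """Adapter dirs are stacked on the base model; full checkpoints load directly."""
    model_dir = Path(model_dir).expanduser().resolve()
    if (model_dir / "adapter_config.json").exists():
        base = AutoModel.from_pretrained("THUDM/chatglm3-6b",
                                         trust_remote_code=True).to(device)
        model = PeftModel.from_pretrained(base, str(model_dir))
        tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm3-6b",
                                                  trust_remote_code=True)
    else:  # assumed: load the directory as a complete fine-tuned checkpoint
        model = AutoModel.from_pretrained(str(model_dir), trust_remote_code=True).to(device)
        tokenizer = AutoTokenizer.from_pretrained(str(model_dir), trust_remote_code=True)
    return model, tokenizer
```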
add_prompt=f"以下是中国关于{subject}考试的单项选择题,请选出其中的正确答案。\n\n")) - for i in range(1, k): - message.append(self.format_example(dev_df.iloc[i, :], cot=cot)) - return message - - def format_example(self, line, include_answer=True, cot=False, add_prompt=''): - example = add_prompt + line['question'] - # print(example) - for choice in self.choices: - example += f'\n{choice}. {line[f"{choice}"]}' - example += '\n答案:' - if include_answer: - if cot: - ans = "让我们一步一步思考,\n" + line["explanation"] + f"\n所以答案是{line['answer']}。" - else: - ans = line["answer"] - m = (example, ans) - return m - return example - - def extract_cot_answer(self, line, gen_ans): - m = re.findall(r'所以答案是(.+?)。', gen_ans, re.M) - if len(m) > 0 and m[-1] in self.choices: - return m[-1], True - answer_patterns = [ - r'([ABCD])是正确的', - r'选项([ABCD])正确', - r'答案为([ABCD])', - r'答案是([ABCD])', - r'答案([ABCD])', - r'选择([ABCD])', - r'答案:([ABCD])', - r'选择答案([ABCD])' - ] - # RE extraction - for answer_pattern in answer_patterns: - m = re.search(answer_pattern, gen_ans, re.M) - if m: - answer = m.group(1) - return answer, False - # only containing one choice-character - m = re.findall(r'[ABCD]', gen_ans, re.M) - if len(m) == 1: - answer = m[0] - return answer, False - answer_word_counter = 0 - # only containing one choice-context - for c in self.choices: - if str(line[f'{c}']) in gen_ans: - answer = c - answer_word_counter += 1 - if answer_word_counter == 1: - return answer, False - return '-', False - - def generate_dist(self, model, tokenizer, query, history, num_beams=1, max_length=2048, - do_sample=False, top_p=0.7, temperature=0.95, logits_processor=None, **kwargs): - if history is None: - history = [] - if logits_processor is None: - logits_processor = LogitsProcessorList() - logits_processor.append(InvalidScoreLogitsProcessor()) - gen_kwargs = {"num_beams": num_beams, "do_sample": do_sample, "top_p": top_p, "max_length": 2048, - "temperature": temperature, "logits_processor": logits_processor, **kwargs} - if not history: - prompt = query - else: - prompt = "" - for i, (old_query, response) in enumerate(history): - prompt += "[Round {}]\n问:{}\n答:{}\n".format(i, old_query, response) - prompt += "[Round {}]\n问:{}\n答:".format(len(history), query) - inputs = tokenizer([prompt], return_tensors="pt") - inputs = inputs.to(model.device) - outputs = model.generate(**inputs, return_dict_in_generate=True, output_scores=True, **gen_kwargs) - - score = outputs.scores[0][0].tolist() - choice_score = [score[167], score[333], score[251], score[416]] - ranked_index = [index for index, value in - sorted(list(enumerate(choice_score)), key=lambda x: x[1], reverse=True)] - return self.choices[ranked_index[0]] diff --git a/evaluators/chatglm3.py b/evaluators/chatglm3.py index f635b10..8c4c3b5 100644 --- a/evaluators/chatglm3.py +++ b/evaluators/chatglm3.py @@ -3,13 +3,11 @@ import re from tqdm import tqdm import torch from transformers import AutoTokenizer, AutoModel, AutoConfig -from transformers.generation.logits_process import LogitsProcessor -from transformers.generation.utils import LogitsProcessorList from evaluators.evaluator import Evaluator +from evaluators.chatglm_mixin import ChatGLMMixin from pathlib import Path from typing import Union, Tuple -import typer from peft import AutoPeftModelForCausalLM, PeftModelForCausalLM from transformers import ( AutoModelForCausalLM, @@ -28,12 +26,12 @@ def _resolve_path(path: Union[str, Path]) -> Path: return Path(path).expanduser().resolve() -def load_model_and_tokenizer(model_dir: Union[str, Path]) -> Tuple[ModelType, 
diff --git a/evaluators/chatgpt.py b/evaluators/chatgpt.py
index 99b63a8..7025670 100644
--- a/evaluators/chatgpt.py
+++ b/evaluators/chatgpt.py
@@ -6,6 +6,37 @@ from time import sleep
 import re
 
 
+def extract_ans(response_str):
+    pattern = [
+        r"^选([A-D])",
+        r"^选项([A-D])",
+        r"答案是\s?选?项?\s?([A-D])",
+        r"答案为\s?选?项?\s?([A-D])",
+        r"答案应为\s?选?项?\s?([A-D])",
+        r"答案选\s?选?项?\s?([A-D])",
+        r"答案是:\s?选?项?\s?([A-D])",
+        r"答案应该是:\s?选?项?\s?([A-D])",
+        r"正确的一项是\s?([A-D])",
+        r"答案为:\s?选?项?\s?([A-D])",
+        r"答案应为:\s?选?项?\s?([A-D])",
+        r"答案:\s?选?项?\s?([A-D])",
+        r"答案是:\s?选?项?\s?([A-D])",
+        r"答案应该是:\s?选?项?\s?([A-D])",
+        r"答案为:\s?选?项?\s?([A-D])",
+        r"答案应为:\s?选?项?\s?([A-D])",
+        r"答案:\s?选?项?\s?([A-D])",
+    ]
+    ans_list = []
+    if response_str[0] in ["A", 'B', 'C', 'D']:
+        ans_list.append(response_str[0])
+    for p in pattern:
+        if len(ans_list) == 0:
+            ans_list = re.findall(p, response_str)
+        else:
+            break
+    return ans_list
+
+
 class ChatGPT_Evaluator(Evaluator):
     def __init__(self, choices, k, api_key, model_name):
         super(ChatGPT_Evaluator, self).__init__(choices, model_name, k)
@@ -34,6 +65,7 @@ class ChatGPT_Evaluator(Evaluator):
         return [
             {"role":"user","content":example},
         ]
+
     def generate_few_shot_prompt(self, subject, dev_df, cot=False):
         prompt=[
             {
@@ -119,7 +151,7 @@ class ChatGPT_Evaluator(Evaluator):
                         correct=0
                 else:
                     if len(response_str)>0:
-                        ans_list=self.extract_ans(response_str)
+                        ans_list = extract_ans(response_str)
                         if len(ans_list)>0 and (ans_list[-1]==row["answer"]):
                             correct_num+=1
                             correct=1
@@ -162,7 +194,7 @@ class ChatGPT_Evaluator(Evaluator):
                 print(msg)
                 sleep(5)
                 continue
-            if response==None:
+            if response is None:
                 response_str=""
                 qa_df.loc[row_index, 'model_output'] = response_str
             else:
@@ -171,33 +203,3 @@ class ChatGPT_Evaluator(Evaluator):
         if save_result_dir:
             result_file_name = f'{subject_name}_qa_test_result.csv'
             qa_df.to_csv(os.path.join(save_result_dir, result_file_name))
-
-    def extract_ans(self,response_str):
-        pattern=[
-            r"^选([A-D])",
-            r"^选项([A-D])",
-            r"答案是\s?选?项?\s?([A-D])",
-            r"答案为\s?选?项?\s?([A-D])",
-            r"答案应为\s?选?项?\s?([A-D])",
-            r"答案选\s?选?项?\s?([A-D])",
-            r"答案是:\s?选?项?\s?([A-D])",
-            r"答案应该是:\s?选?项?\s?([A-D])",
-            r"正确的一项是\s?([A-D])",
-            r"答案为:\s?选?项?\s?([A-D])",
-            r"答案应为:\s?选?项?\s?([A-D])",
-            r"答案:\s?选?项?\s?([A-D])",
-            r"答案是:\s?选?项?\s?([A-D])",
-            r"答案应该是:\s?选?项?\s?([A-D])",
-            r"答案为:\s?选?项?\s?([A-D])",
-            r"答案应为:\s?选?项?\s?([A-D])",
-            r"答案:\s?选?项?\s?([A-D])",
-        ]
-        ans_list=[]
-        if response_str[0] in ["A",'B','C','D']:
-            ans_list.append(response_str[0])
-        for p in pattern:
-            if len(ans_list)==0:
-                ans_list=re.findall(p,response_str)
-            else:
-                break
-        return ans_list
diff --git a/evaluators/evaluator.py b/evaluators/evaluator.py
index 46a6dc7..1c70c4a 100644
--- a/evaluators/evaluator.py
+++ b/evaluators/evaluator.py
@@ -7,31 +7,6 @@ class Evaluator:
         self.k = k
         self.puncs = list(string.punctuation)
 
-    def format_example(self, line, include_answer=True):
-        example = line['question']
-        # print(example)
-        for choice in self.choices:
-            example += f'\n{choice}. {line[f"{choice}"]}'
-        example += '\n答案:'
-        if include_answer:
-            example += f'{line["answer"]}\n\n'
-        return example
-
-    def generate_few_shot_prompt(self, subject, dev_df):
-        prompt = f"以下是中国关于{subject}考试的单项选择题,请选出其中的正确答案。\n\n"
-        k = self.k
-        if self.k == -1:
-            k = dev_df.shape[0]
-        for i in range(k):
-            prompt += self.format_example(dev_df.iloc[i, :])
-        return prompt
-
-    def eval_subject(self, subject_name, test_df, dev_df=None, few_shot=False, save_result_dir=None):
-        pass
-
-    def eval_qa(self, subject_name, qa_df, save_result_dir=None):
-        pass
-
     def normalize_answer(self,s):
 
         def white_space_fix(text):
diff --git a/re_extract.py b/re_extract.py
new file mode 100644
index 0000000..6134d0b
--- /dev/null
+++ b/re_extract.py
@@ -0,0 +1,67 @@
+import math
+
+import pandas as pd
+from scoring.gpt_scorer import GPTScorer, extract_score
+
+machine_score_df = pd.read_csv('logs/other/20240408181951_result_diff_test_score_82.95347116717225.csv')
+
+gpt_scorer = GPTScorer("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")
+finetune_rouge_score_sum = 0
+origin_rouge_score_sum = 0
+finetune_acc_score_sum = 0
+origin_acc_score_sum = 0
+finetune_fluency_score_sum = 0
+origin_fluency_score_sum = 0
+model_better_score_sum = 0
+row_count = 0
+
+for row_index, row in machine_score_df.iterrows():
+    row_count += 1
+    response_text = row['acc_response_finetune']
+    print(response_text)
+    score = extract_score(response_text)
+    machine_score_df.loc[row_index, 'acc_finetune'] = score
+    finetune_acc_score_sum += float(score)
+    response_text = row['acc_response_origin']
+    score = extract_score(response_text)
+    machine_score_df.loc[row_index, 'acc_origin'] = score
+    origin_acc_score_sum += float(score)
+    response_text = row['fluency_response_finetune']
+    score = extract_score(response_text)
+    machine_score_df.loc[row_index, 'fluency_finetune'] = score
+    finetune_fluency_score_sum += float(score)
+    response_text = row['fluency_response_origin']
+    score = extract_score(response_text)
+    machine_score_df.loc[row_index, 'fluency_origin'] = score
+    origin_fluency_score_sum += float(score)
+    response_text = row['diff_score_response']
+    score = extract_score(response_text)
+    machine_score_df.loc[row_index, 'diff_score'] = score
+    model_better_score_sum += float(score)
+
+    origin_rouge_1_f_score = row['rouge_score_origin']
+    origin_rouge_score_sum += origin_rouge_1_f_score
+    finetune_rouge_1_f_score = row['rouge_score_finetune']
+    finetune_rouge_score_sum += finetune_rouge_1_f_score
+
+machine_score_df.to_csv('logs/other/re_20240408181951_result_diff_test_score_82.95347116717225.csv', index=False)
+# synthesis_rouge_score = finetune_rouge_score_sum / row_count
+# original_rouge_score = origin_rouge_score_sum / row_count
+# synthesis_acc_score = finetune_acc_score_sum / row_count
+# original_acc_score = origin_acc_score_sum / row_count
+# synthesis_fluency_score = finetune_fluency_score_sum / row_count
+# original_fluency_score = origin_fluency_score_sum / row_count
+# synthesis_diff_score = model_better_score_sum / row_count
+# print("微调模型ROUGE分数:", synthesis_rouge_score)
+# print("原模型ROUGE分数:", original_rouge_score)
+# print("微调模型准确性分数:", synthesis_acc_score)
+# print("原模型准确性分数:", original_acc_score)
+# print("微调模型流畅度分数:", synthesis_fluency_score)
+# print("原模型流畅度分数:", original_fluency_score)
+# print("微调模型优于原模型分数:", synthesis_diff_score)
+# synthesis_score = (synthesis_rouge_score * 100 + synthesis_acc_score * 100 / 4 +
+#                    synthesis_fluency_score * 100 / 3 + synthesis_diff_score * 100 / 3) / 4
+# print("综合评分:", synthesis_score)
+# original_synthesis_score = (original_rouge_score * 100 + original_acc_score * 100 / 4 +
+#                             original_fluency_score * 100 / 3 + 66) / 4
+# print("原模型综合评分:", original_synthesis_score)
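`re_extract.py` exists so that scores can be re-parsed out of GPT replies that were already collected, without calling the API again, e.g. after `extract_score` gains a new pattern. The core move, on a toy frame standing in for one of the cached-response columns (the package must be importable from the repo root):

```python
import pandas as pd

from scoring.gpt_scorer import extract_score

# Toy stand-in for a saved *_result_diff_test_score_*.csv.
df = pd.DataFrame({"acc_response_finetune": ["评分为5分。理由:信息准确无误。",
                                             "理由:基本一致。因此,评分为4分。"]})

# Re-parse the cached replies instead of re-querying the API.
df["acc_finetune"] = df["acc_response_finetune"].map(extract_score)
print(df["acc_finetune"].astype(float).mean())  # 4.5
```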
diff --git a/scoring/assessment_engine.py b/scoring/assessment_engine.py
index 4a845e1..2195a77 100644
--- a/scoring/assessment_engine.py
+++ b/scoring/assessment_engine.py
@@ -11,7 +11,7 @@ class AssessmentEngine:
         self.gpt_scorer = GPTScorer(api_key)
 
     def eval_subject(self, subject_name, csv_file_name):
-        qa_result_df = pd.read_csv('logs/' + self.save_result_dir + '/' + csv_file_name)
+        qa_result_df = pd.read_csv(self.save_result_dir + '/' + csv_file_name)
         start_time = time.time()
         row_count = 0
         rouge_score_sum = 0
@@ -33,11 +33,17 @@ class AssessmentEngine:
         elapsed_time = end_time - start_time
         print("共评估结果" + str(row_count) + "条,总共用时:", elapsed_time, "秒")
         synthesis_score = rouge_score_sum / row_count
-        qa_result_df.to_csv('logs/' + self.save_result_dir + '/' + subject_name + '_qa_test_score_'
+        qa_result_df.to_csv(self.save_result_dir + '/' + subject_name + '_qa_test_score_'
                             + str(synthesis_score) + '.csv', index=False)
 
-    def eval_result_diff(self, csv_file_name):
-        result_diff_df = pd.read_csv('logs/' + self.save_result_dir + '/' + csv_file_name)
+    def eval_result_diff(self, csv_file_name, file_type='csv'):
+        if file_type == 'json':
+            result_diff_df = pd.read_json(self.save_result_dir + '/' + csv_file_name)
+        elif file_type == 'csv':
+            result_diff_df = pd.read_csv(self.save_result_dir + '/' + csv_file_name)
+        else:
+            print("Unknown file type: " + file_type)
+            return
         result_diff_df['rouge_score_finetune'] = 0
         result_diff_df['rouge_score_origin'] = 0
         result_diff_df['acc_finetune'] = 0
@@ -118,7 +124,7 @@ class AssessmentEngine:
             result_diff_df.loc[row_index, 'diff_score_response'] = gpt_response_diff
             if (gpt_score_diff is not None) and gpt_score_diff.isdigit():
                 model_better_score_sum += float(gpt_score_diff)
-            result_diff_df.to_csv('logs/' + self.save_result_dir + '/result_diff_test_score_tmp.csv', index=False)
+            result_diff_df.to_csv(self.save_result_dir + '/result_diff_test_score_tmp.csv', index=False)
 
         end_time = time.time()
         elapsed_time = end_time - start_time
@@ -145,5 +151,60 @@ class AssessmentEngine:
         print("原模型综合评分:", original_synthesis_score)
         # 获取当前时间的字符串
         current_time = time.strftime("%Y%m%d%H%M%S", time.localtime(time.time()))
-        result_diff_df.to_csv('logs/' + self.save_result_dir + '/' + current_time + '_result_diff_test_score_'
+        result_diff_df.to_csv(self.save_result_dir + '/' + current_time + '_result_diff_test_score_'
                               + str(synthesis_score) + '.csv', index=False)
+
+    def eval_result(self, file_name, file_type='csv'):
+        start_time = time.time()
+        if file_type == 'json':
+            result_df = pd.read_json(self.save_result_dir + '/' + file_name)
+        elif file_type == 'csv':
+            result_df = pd.read_csv(self.save_result_dir + '/' + file_name)
+        else:
+            print("Unsupported file type: " + file_type)
+            return
+        result_df['rouge_score_finetune'] = 0
+        result_df['acc_finetune'] = 0
+        result_df['fluency_finetune'] = 0
+        result_df['acc_response_finetune'] = 0
+        result_df['fluency_response_finetune'] = 0
+        rouge_score_sum = 0
+        acc_score_sum = 0
+        fluency_score_sum = 0
+        row_count = 0
+        for row_index, row in tqdm(result_df.iterrows(), total=len(result_df)):
+            row_count += 1
+            test_question = row['question']
+            model_response = row['Predict']
+            reference_answer = row['answer']
+            rouge_score = get_rouge_score(model_response, reference_answer)
+            rouge_1_f_score = rouge_score['rouge-1']['f']
+            rouge_score_sum += rouge_1_f_score
+            result_df.loc[row_index, 'rouge_score_finetune'] = rouge_1_f_score
+            self.gpt_scorer.mode("accuracy")
+            gpt_response_acc, gpt_score_acc = self.gpt_scorer.score_with_chatgpt(test_question,
+                                                                                 model_response, reference_answer)
+            result_df.loc[row_index, 'acc_finetune'] = gpt_score_acc
+            result_df.loc[row_index, 'acc_response_finetune'] = gpt_response_acc
+            if (gpt_score_acc is not None) and gpt_score_acc.isdigit():
+                acc_score_sum += float(gpt_score_acc)
+            self.gpt_scorer.mode("fluency")
+            gpt_response_fluency, gpt_score_fluency = self.gpt_scorer.score_with_chatgpt(test_question,
+                                                                                         model_response, reference_answer)
+            result_df.loc[row_index, 'fluency_finetune'] = gpt_score_fluency
+            result_df.loc[row_index, 'fluency_response_finetune'] = gpt_response_fluency
+            if (gpt_score_fluency is not None) and gpt_score_fluency.isdigit():
+                fluency_score_sum += float(gpt_score_fluency)
+            result_df.to_csv(self.save_result_dir + '/result_test_score_tmp.csv', index=False)
+        end_time = time.time()
+        elapsed_time = end_time - start_time
+        print("共评估结果" + str(row_count) + "条,总共用时:", elapsed_time, "秒")
+        rouge_score = rouge_score_sum / row_count
+        acc_score = acc_score_sum / row_count
+        fluency_score = fluency_score_sum / row_count
+        print("ROUGE分数:", rouge_score)
+        print("准确性分数:", acc_score)
+        print("流畅度分数:", fluency_score)
+        synthesis_score = (rouge_score * 100 + acc_score * 100 / 4 + fluency_score * 100 / 3 + 66) / 4
+        print("综合评分:", synthesis_score)
+        result_df.to_csv(self.save_result_dir + f'/result_test_score_{synthesis_score}.csv', index=False)
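The composite score used throughout `AssessmentEngine` rescales each component before averaging: ROUGE-1 F (0-1) is multiplied by 100, the 1-5 accuracy rating by 100/4, and the 1-3 fluency and better-than-original ratings by 100/3; where no comparison rating exists, the constant 66 stands in for the last term, as in `eval_result` above. Expressed as a function:

```python
def synthesis_score(rouge_1_f, acc, fluency, diff_term):
    """Composite score as used by AssessmentEngine: four rescaled terms, averaged."""
    return (rouge_1_f * 100 + acc * 100 / 4 + fluency * 100 / 3 + diff_term * 100 / 3) / 4


# Figures taken from the commented-out example in test.py below:
print(synthesis_score(0.3036, 2.768, 2.098, 2.278))                  # fine-tuned model
print((0.2600 * 100 + 2.724 * 100 / 4 + 2.236 * 100 / 3 + 66) / 4)  # original: 66 replaces the diff term
```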
{line[f"{choice}"]}' + example += '\n答案:' + if include_answer: + if cot: + ans = "让我们一步一步思考,\n" + line["explanation"] + f"\n所以答案是{line['answer']}。" + else: + ans = line["answer"] + if self.model_name == 'chatglm3': + m = [{ + 'role': 'user', + 'content': example + }, { + 'role': 'assistant', + 'content': ans + }] + else: + m = (example, ans) + return m + return example + + def extract_cot_answer(self, line, gen_ans): + m = re.findall(r'所以答案是(.+?)。', gen_ans, re.M) + if len(m) > 0 and m[-1] in self.choices: + return m[-1], True + answer_patterns = [ + r'([ABCD])是正确的', + r'选项([ABCD])正确', + r'答案为([ABCD])', + r'答案是([ABCD])', + r'答案([ABCD])', + r'选择([ABCD])', + r'答案:([ABCD])', + r'选择答案([ABCD])', + r'正确答案是([ABCD])' + ] + # RE extraction + for answer_pattern in answer_patterns: + m = re.search(answer_pattern, gen_ans, re.M) + if m: + answer = m.group(1) + return answer, False + # only containing one choice-character + m = re.findall(r'[ABCD]', gen_ans, re.M) + if len(m) == 1: + answer = m[0] + return answer, False + answer_word_counter = 0 + # only containing one choice-context + for c in self.choices: + if str(line[f'{c}']) in gen_ans: + answer = c + answer_word_counter += 1 + if answer_word_counter == 1: + return answer, False + return '-', False diff --git a/evaluators/chatgpt.py b/evaluators/chatgpt.py index 99b63a8..7025670 100644 --- a/evaluators/chatgpt.py +++ b/evaluators/chatgpt.py @@ -6,6 +6,37 @@ from time import sleep import re +def extract_ans(response_str): + pattern=[ + r"^选([A-D])", + r"^选项([A-D])", + r"答案是\s?选?项?\s?([A-D])", + r"答案为\s?选?项?\s?([A-D])", + r"答案应为\s?选?项?\s?([A-D])", + r"答案选\s?选?项?\s?([A-D])", + r"答案是:\s?选?项?\s?([A-D])", + r"答案应该是:\s?选?项?\s?([A-D])", + r"正确的一项是\s?([A-D])", + r"答案为:\s?选?项?\s?([A-D])", + r"答案应为:\s?选?项?\s?([A-D])", + r"答案:\s?选?项?\s?([A-D])", + r"答案是:\s?选?项?\s?([A-D])", + r"答案应该是:\s?选?项?\s?([A-D])", + r"答案为:\s?选?项?\s?([A-D])", + r"答案应为:\s?选?项?\s?([A-D])", + r"答案:\s?选?项?\s?([A-D])", + ] + ans_list=[] + if response_str[0] in ["A",'B','C','D']: + ans_list.append(response_str[0]) + for p in pattern: + if len(ans_list)==0: + ans_list=re.findall(p,response_str) + else: + break + return ans_list + + class ChatGPT_Evaluator(Evaluator): def __init__(self, choices, k, api_key,model_name): super(ChatGPT_Evaluator, self).__init__(choices, model_name, k) @@ -34,6 +65,7 @@ class ChatGPT_Evaluator(Evaluator): return [ {"role":"user","content":example}, ] + def generate_few_shot_prompt(self, subject, dev_df, cot=False): prompt=[ { @@ -119,7 +151,7 @@ class ChatGPT_Evaluator(Evaluator): correct=0 else: if len(response_str)>0: - ans_list=self.extract_ans(response_str) + ans_list= extract_ans(response_str) if len(ans_list)>0 and (ans_list[-1]==row["answer"]): correct_num+=1 correct=1 @@ -162,7 +194,7 @@ class ChatGPT_Evaluator(Evaluator): print(msg) sleep(5) continue - if response==None: + if response is None: response_str="" qa_df.loc[row_index, 'model_output'] = response_str else: @@ -171,33 +203,3 @@ class ChatGPT_Evaluator(Evaluator): if save_result_dir: result_file_name = f'{subject_name}_qa_test_result.csv' qa_df.to_csv(os.path.join(save_result_dir, result_file_name)) - - def extract_ans(self,response_str): - pattern=[ - r"^选([A-D])", - r"^选项([A-D])", - r"答案是\s?选?项?\s?([A-D])", - r"答案为\s?选?项?\s?([A-D])", - r"答案应为\s?选?项?\s?([A-D])", - r"答案选\s?选?项?\s?([A-D])", - r"答案是:\s?选?项?\s?([A-D])", - r"答案应该是:\s?选?项?\s?([A-D])", - r"正确的一项是\s?([A-D])", - r"答案为:\s?选?项?\s?([A-D])", - r"答案应为:\s?选?项?\s?([A-D])", - r"答案:\s?选?项?\s?([A-D])", - r"答案是:\s?选?项?\s?([A-D])", - 
r"答案应该是:\s?选?项?\s?([A-D])", - r"答案为:\s?选?项?\s?([A-D])", - r"答案应为:\s?选?项?\s?([A-D])", - r"答案:\s?选?项?\s?([A-D])", - ] - ans_list=[] - if response_str[0] in ["A",'B','C','D']: - ans_list.append(response_str[0]) - for p in pattern: - if len(ans_list)==0: - ans_list=re.findall(p,response_str) - else: - break - return ans_list diff --git a/evaluators/evaluator.py b/evaluators/evaluator.py index 46a6dc7..1c70c4a 100644 --- a/evaluators/evaluator.py +++ b/evaluators/evaluator.py @@ -7,31 +7,6 @@ class Evaluator: self.k = k self.puncs = list(string.punctuation) - def format_example(self, line, include_answer=True): - example = line['question'] - # print(example) - for choice in self.choices: - example += f'\n{choice}. {line[f"{choice}"]}' - example += '\n答案:' - if include_answer: - example += f'{line["answer"]}\n\n' - return example - - def generate_few_shot_prompt(self, subject, dev_df): - prompt = f"以下是中国关于{subject}考试的单项选择题,请选出其中的正确答案。\n\n" - k = self.k - if self.k == -1: - k = dev_df.shape[0] - for i in range(k): - prompt += self.format_example(dev_df.iloc[i, :]) - return prompt - - def eval_subject(self, subject_name, test_df, dev_df=None, few_shot=False, save_result_dir=None): - pass - - def eval_qa(self, subject_name, qa_df, save_result_dir=None): - pass - def normalize_answer(self,s): def white_space_fix(text): diff --git a/re_extract.py b/re_extract.py new file mode 100644 index 0000000..6134d0b --- /dev/null +++ b/re_extract.py @@ -0,0 +1,67 @@ +import math + +import pandas as pd +from scoring.gpt_scorer import GPTScorer, extract_score + +machine_score_df = pd.read_csv('logs/other/20240408181951_result_diff_test_score_82.95347116717225.csv') + +gpt_scorer = GPTScorer("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") +finetune_rouge_score_sum = 0 +origin_rouge_score_sum = 0 +finetune_acc_score_sum = 0 +origin_acc_score_sum = 0 +finetune_fluency_score_sum = 0 +origin_fluency_score_sum = 0 +model_better_score_sum = 0 +row_count = 0 + +for row_index, row in machine_score_df.iterrows(): + row_count += 1 + response_text = row['acc_response_finetune'] + print(response_text) + score = extract_score(response_text) + machine_score_df.loc[row_index, 'acc_finetune'] = score + finetune_acc_score_sum += float(score) + response_text = row['acc_response_origin'] + score = extract_score(response_text) + machine_score_df.loc[row_index, 'acc_origin'] = score + origin_acc_score_sum += float(score) + response_text = row['fluency_response_finetune'] + score = extract_score(response_text) + machine_score_df.loc[row_index, 'fluency_finetune'] = score + finetune_fluency_score_sum += float(score) + response_text = row['fluency_response_origin'] + score = extract_score(response_text) + machine_score_df.loc[row_index, 'fluency_origin'] = score + origin_fluency_score_sum += float(score) + response_text = row['diff_score_response'] + score = extract_score(response_text) + machine_score_df.loc[row_index, 'diff_score'] = score + model_better_score_sum += float(score) + + origin_rouge_1_f_score = row['rouge_score_origin'] + origin_rouge_score_sum += origin_rouge_1_f_score + finetune_rouge_1_f_score = row['rouge_score_finetune'] + finetune_rouge_score_sum += finetune_rouge_1_f_score + +machine_score_df.to_csv('logs/other/re_20240408181951_result_diff_test_score_82.95347116717225.csv', index=False) +# synthesis_rouge_score = finetune_rouge_score_sum / row_count +# original_rouge_score = origin_rouge_score_sum / row_count +# synthesis_acc_score = finetune_acc_score_sum / row_count +# original_acc_score = 
origin_acc_score_sum / row_count +# synthesis_fluency_score = finetune_fluency_score_sum / row_count +# original_fluency_score = origin_fluency_score_sum / row_count +# synthesis_diff_score = model_better_score_sum / row_count +# print("微调模型ROUGE分数:", synthesis_rouge_score) +# print("原模型ROUGE分数:", original_rouge_score) +# print("微调模型准确性分数:", synthesis_acc_score) +# print("原模型准确性分数:", original_acc_score) +# print("微调模型流畅度分数:", synthesis_fluency_score) +# print("原模型流畅度分数:", original_fluency_score) +# print("微调模型优于原模型分数:", synthesis_diff_score) +# synthesis_score = (synthesis_rouge_score * 100 + synthesis_acc_score * 100 / 4 + +# synthesis_fluency_score * 100 / 3 + synthesis_diff_score * 100 / 3) / 4 +# print("综合评分:", synthesis_score) +# original_synthesis_score = (original_rouge_score * 100 + original_acc_score * 100 / 4 + +# original_fluency_score * 100 / 3 + 66) / 4 +# print("原模型综合评分:", original_synthesis_score) diff --git a/scoring/assessment_engine.py b/scoring/assessment_engine.py index 4a845e1..2195a77 100644 --- a/scoring/assessment_engine.py +++ b/scoring/assessment_engine.py @@ -11,7 +11,7 @@ class AssessmentEngine: self.gpt_scorer = GPTScorer(api_key) def eval_subject(self, subject_name, csv_file_name): - qa_result_df = pd.read_csv('logs/' + self.save_result_dir + '/' + csv_file_name) + qa_result_df = pd.read_csv(self.save_result_dir + '/' + csv_file_name) start_time = time.time() row_count = 0 rouge_score_sum = 0 @@ -33,11 +33,17 @@ class AssessmentEngine: elapsed_time = end_time - start_time print("共评估结果" + str(row_count) + "条,总共用时:", elapsed_time, "秒") synthesis_score = rouge_score_sum / row_count - qa_result_df.to_csv('logs/' + self.save_result_dir + '/' + subject_name + '_qa_test_score_' + qa_result_df.to_csv(self.save_result_dir + '/' + subject_name + '_qa_test_score_' + str(synthesis_score) + '.csv', index=False) - def eval_result_diff(self, csv_file_name): - result_diff_df = pd.read_csv('logs/' + self.save_result_dir + '/' + csv_file_name) + def eval_result_diff(self, csv_file_name, file_type='csv'): + if file_type == 'json': + result_diff_df = pd.read_json(self.save_result_dir + '/' + csv_file_name) + elif file_type == 'csv': + result_diff_df = pd.read_csv(self.save_result_dir + '/' + csv_file_name) + else: + print("Unknown file type:" + file_type) + return result_diff_df['rouge_score_finetune'] = 0 result_diff_df['rouge_score_origin'] = 0 result_diff_df['acc_finetune'] = 0 @@ -118,7 +124,7 @@ class AssessmentEngine: result_diff_df.loc[row_index, 'diff_score_response'] = gpt_response_diff if (gpt_score_diff is not None) and gpt_score_diff.isdigit(): model_better_score_sum += float(gpt_score_diff) - result_diff_df.to_csv('logs/' + self.save_result_dir + '/result_diff_test_score_tmp.csv', index=False) + result_diff_df.to_csv(self.save_result_dir + '/result_diff_test_score_tmp.csv', index=False) end_time = time.time() elapsed_time = end_time - start_time @@ -145,5 +151,60 @@ class AssessmentEngine: print("原模型综合评分:", original_synthesis_score) # 获取当前时间的字符串 current_time = time.strftime("%Y%m%d%H%M%S", time.localtime(time.time())) - result_diff_df.to_csv('logs/' + self.save_result_dir + '/' + current_time + '_result_diff_test_score_' + result_diff_df.to_csv(self.save_result_dir + '/' + current_time + '_result_diff_test_score_' + str(synthesis_score) + '.csv', index=False) + + def eval_result(self, file_name, file_type='csv'): + start_time = time.time() + if file_type == 'json': + result_df = pd.read_json(self.save_result_dir + '/' + file_name) + elif file_type == 'csv': + 
result_df = pd.read_csv(self.save_result_dir + '/' + file_name) + else: + print("Unsupported file type:" + file_type) + return + result_df['rouge_score_finetune'] = 0 + result_df['acc_finetune'] = 0 + result_df['fluency_finetune'] = 0 + result_df['acc_response_finetune'] = 0 + result_df['fluency_response_finetune'] = 0 + rouge_score_sum = 0 + acc_score_sum = 0 + fluency_score_sum = 0 + row_count = 0 + for row_index, row in tqdm(result_df.iterrows(), total=len(result_df)): + row_count += 1 + test_question = row['question'] + model_response = row['Predict'] + reference_answer = row['answer'] + rouge_score = get_rouge_score(model_response, reference_answer) + rouge_1_f_score = rouge_score['rouge-1']['f'] + rouge_score_sum += rouge_1_f_score + result_df.loc[row_index, 'rouge_score_finetune'] = rouge_1_f_score + self.gpt_scorer.mode("accuracy") + gpt_response_acc, gpt_score_acc = self.gpt_scorer.score_with_chatgpt(test_question, + model_response, reference_answer) + result_df.loc[row_index, 'acc_finetune'] = gpt_score_acc + result_df.loc[row_index, 'acc_response_finetune'] = gpt_response_acc + if (gpt_score_acc is not None) and gpt_score_acc.isdigit(): + acc_score_sum += float(gpt_score_acc) + self.gpt_scorer.mode("fluency") + gpt_response_fluency, gpt_score_fluency = self.gpt_scorer.score_with_chatgpt(test_question, + model_response, reference_answer) + result_df.loc[row_index, 'fluency_finetune'] = gpt_score_fluency + result_df.loc[row_index, 'fluency_response_finetune'] = gpt_response_fluency + if (gpt_score_fluency is not None) and gpt_score_fluency.isdigit(): + fluency_score_sum += float(gpt_score_fluency) + result_df.to_csv(self.save_result_dir + '/result_test_score_tmp.csv', index=False) + end_time = time.time() + elapsed_time = end_time - start_time + print("共评估结果" + str(row_count) + "条,总共用时:", elapsed_time, "秒") + rouge_score = rouge_score_sum / row_count + acc_score = acc_score_sum / row_count + fluency_score = fluency_score_sum / row_count + print("ROUGE分数:", rouge_score) + print("准确性分数:", acc_score) + print("流畅度分数:", fluency_score) + synthesis_score = (rouge_score * 100 + acc_score * 100 / 4 + fluency_score * 100 / 3 + 66) / 4 + print("综合评分:", synthesis_score) + result_df.to_csv(self.save_result_dir + f'/result_test_score_{synthesis_score}.csv', index=False) diff --git a/scoring/gemini_scorer.py b/scoring/gemini_scorer.py new file mode 100644 index 0000000..39a67d1 --- /dev/null +++ b/scoring/gemini_scorer.py @@ -0,0 +1,7 @@ +import google.generativeai as genai + +genai.configure(api_key='AIzaSyAW_h8itGLwNhYTfx1EDLthhcHHlcIfs7w') +model = genai.GenerativeModel('gemini-pro') + +response = model.generate_content("Write a story about a magic backpack.") +print(response.text) diff --git a/scoring/gpt_scorer.py b/scoring/gpt_scorer.py index 703cede..3c60b01 100644 --- a/scoring/gpt_scorer.py +++ b/scoring/gpt_scorer.py @@ -2,6 +2,39 @@ import openai import re +def extract_score(response_text): + response_text = str(response_text) + # 提取评分 + pattern = [ + r"^评分为([1-5])分", + r"评分:([1-5])分", + r"评分为([1-5])" + ] + score_list = [] + for p in pattern: + if len(score_list) == 0: + score_list = re.findall(p, response_text) + else: + break + if len(score_list) == 0: + return '3' + return score_list[0] + + +def request_gpt(prompt, retries=3): + ordinal = lambda n: str(n) + {1: "st", 2: "nd", 3: "rd"}.get(10 <= n % 100 <= 20 and n or n % 10, "th") + for i in range(retries): + try: + response = openai.ChatCompletion.create( + model="gpt-3.5-turbo", + messages=prompt, + ) + return 
response.choices[0]['message']['content'] + except Exception as e: + print(f"An error occurred while scoring with ChatGPT: {e}, it's the {ordinal(i+1)} time.") + return None + + class GPTScorer: def __init__(self, api_key): openai.api_key = api_key @@ -17,32 +50,17 @@ class GPTScorer: def score_with_chatgpt(self, question, model_result, reference, origin_model_result=None): prompt = self.generate_scoring_prompt(question, model_result, reference, origin_model_result) try: - # 提交文本以获取ChatGPT评分 - response = openai.ChatCompletion.create( - model="gpt-3.5-turbo", - messages=prompt, - ) - # 提取评分 - chatgpt_response = response.choices[0]['message']['content'] - chatgpt_score = self.extract_score(chatgpt_response) + chatgpt_response = request_gpt(prompt, retries=5) + chatgpt_score = extract_score(chatgpt_response) return chatgpt_response, chatgpt_score except Exception as e: - print("An error occurred while scoring with ChatGPT:", e) + print("An error occurred while extract score:", e) return None, '2' def generate_scoring_prompt(self, question, model_result, reference, origin_model_result=None): # 生成评分提示 base_prompt = [] if self.eval_mode == "accuracy": - # base_prompt = [{ - # "role": "system", - # "content": "你是一个汽车领域专家,接下来将向你提供一个问题、一个参考答案和一个大模型生成的结果。" - # "请对比参考答案和大模型生成结果,从信息准确性的角度评分以下生成的结果,以评估其质量。满分为5分。" - # "评分标准为:信息准确无误——5分。信息大致符合实际信息——4分。" - # "信息不全面但明确表达了自身无法回答——3分。信息完全错误——2分。回答无关或回答语句不完整——1分。" - # "可以根据实际情况稍作调整。" - # "回复格式为:评分为x分。理由:xxx。" - # }] base_prompt = [{ "role": "system", "content": "你是一个汽车领域专家,接下来将向你提供一个问题、一个参考答案和一个大模型生成的结果。" @@ -90,23 +108,6 @@ class GPTScorer: } ] return prompt - # AIzaSyAW_h8itGLwNhYTfx1EDLthhcHHlcIfs7w (google) - - def extract_score(self, response_text): - # 提取评分 - pattern = [ - r"^评分为([1-5])分", - r"评分:([1-5])分", - ] - score_list = [] - for p in pattern: - if len(score_list) == 0: - score_list = re.findall(p, response_text) - else: - break - if len(score_list) == 0: - return '3' - return score_list[0] # 示例用法 @@ -116,17 +117,19 @@ if __name__ == "__main__": # 初始化模型评分器 scorer = GPTScorer(my_api_key) - # 要评分的大模型结果 - sample_question = "秦Plus-DMi车型的安全气囊有哪些类型?" - sample_model_result = ("截止到我最后更新知识的时候,关于秦Plus-DMi车型的具体安全气囊类型的信息我并没有。" - "通常来说,汽车的安全气囊系统可能包括驾驶员气囊、副驾驶气囊、侧面气囊、头部气囊等。" - "但具体车型的安全气囊配置可能会因地区、年份和车型的不同而有所差异。" - "建议您直接查询该车型的官方资料或者联系经销商以获取最准确的信息。") - sample_reference = "秦Plus-DMi配备有驾驶员安全气囊、前排乘员安全气囊、侧帘式安全气囊和座椅侧安全气囊。" + print(extract_score('理由:参考答案与生成的结果完全一致,信息准确无误。因此,评分为4分。')) - # 获取ChatGPT评分 - response_text, score = scorer.mode('accuracy').score_with_chatgpt(sample_question, sample_model_result, sample_reference) - if response_text is not None: - print("ChatGPT评分:", score, "\nChatGPT回复:", response_text) - else: - print("无法获取ChatGPT评分。") + # 要评分的大模型结果 + # sample_question = "秦Plus-DMi车型的安全气囊有哪些类型?" 
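`request_gpt` centralizes the ChatCompletion call and swallows transient failures, returning `None` once the retry budget is exhausted; callers such as `score_with_chatgpt` then fall back to a default score. A usage sketch (the key is a placeholder, and the function signature is as reconstructed above):

```python
import openai

from scoring.gpt_scorer import request_gpt

openai.api_key = "sk-..."  # placeholder; set a real key first

prompt = [{"role": "user", "content": "用一句话介绍ROUGE指标。"}]
# Up to 5 attempts; each failure is logged with its ordinal ("1st time", ...).
reply = request_gpt(prompt, retries=5)
print(reply if reply is not None else "scoring request failed")
```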
diff --git a/test.py b/test.py
index 6b5c489..c735f28 100644
--- a/test.py
+++ b/test.py
@@ -1,28 +1,37 @@
+import argparse
+
 from scoring.assessment_engine import AssessmentEngine
 
-assessment_engine = AssessmentEngine("other",
-                                     "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")
-assessment_engine.eval_result_diff("0408output-dora.csv")
+
+def main(args):
+    assessment_engine = AssessmentEngine("logs/other", args.openai_key)
+    assessment_engine.eval_result_diff("0319output.csv")
+    assessment_engine.eval_result("output-pt-sft.json", "json")
 
-# synthesis_rouge_score = 0.30358589506467687
-# print("微调模型ROUGE分数:", synthesis_rouge_score)
-# original_rouge_score = 0.26004000118452175
-# print("原模型ROUGE分数:", original_rouge_score)
-# synthesis_acc_score = 2.768
-# print("微调模型准确性分数:", synthesis_acc_score)
-# original_acc_score = 2.724
-# print("原模型准确性分数:", original_acc_score)
-# synthesis_fluency_score = 2.098
-# print("微调模型流畅度分数:", synthesis_fluency_score)
-# original_fluency_score = 2.236
-# print("原模型流畅度分数:", original_fluency_score)
-# synthesis_diff_score = 2.278
-# print("微调模型优于原模型分数:", synthesis_diff_score)
-#
-# synthesis_score = (synthesis_rouge_score * 100 + synthesis_acc_score * 100/4 + synthesis_fluency_score * 100/3
-#                    + synthesis_diff_score * 100/3 ) / 4
-# original_synthesis_score = (original_rouge_score * 100 + original_acc_score * 100/4 +
-#                             original_fluency_score * 100/3 + 66 ) / 4
-#
-# print("综合评分:", synthesis_score)
-# print("原模型综合评分:", original_synthesis_score)
+    # synthesis_rouge_score = 0.30358589506467687
+    # print("微调模型ROUGE分数:", synthesis_rouge_score)
+    # original_rouge_score = 0.26004000118452175
+    # print("原模型ROUGE分数:", original_rouge_score)
+    # synthesis_acc_score = 2.768
+    # print("微调模型准确性分数:", synthesis_acc_score)
+    # original_acc_score = 2.724
+    # print("原模型准确性分数:", original_acc_score)
+    # synthesis_fluency_score = 2.098
+    # print("微调模型流畅度分数:", synthesis_fluency_score)
+    # original_fluency_score = 2.236
+    # print("原模型流畅度分数:", original_fluency_score)
+    # synthesis_diff_score = 2.278
+    # print("微调模型优于原模型分数:", synthesis_diff_score)
+    #
+    # synthesis_score = (synthesis_rouge_score * 100 + synthesis_acc_score * 100/4 + synthesis_fluency_score * 100/3
+    #                    + synthesis_diff_score * 100/3) / 4
+    # original_synthesis_score = (original_rouge_score * 100 + original_acc_score * 100/4 +
+    #                             original_fluency_score * 100/3 + 66) / 4
+    #
+    # print("综合评分:", synthesis_score)
+    # print("原模型综合评分:", original_synthesis_score)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--openai_key", type=str, default="xxx")
+    user_args = parser.parse_args()
+    main(user_args)