优化代码结构,减少重复代码,增加复用模块。

完善评估流程,保证选择题->问答题->大模型评估全流程覆盖。
main
PeterAlbus 7 months ago
parent 68e59328f6
commit af8bd1e046

2
.gitignore vendored

@ -163,6 +163,8 @@ cython_debug/
/.idea/
/THUDM/
/THUDM/chatglm-6b/
/THUDM/chatglm2-6b/
/THUDM/chatglm3-6b/
/lora/
/ptuning/
/logs/

@ -4,33 +4,51 @@ A simple program to evaluate large language model.
## Recommend Requirements
- Python 3.8
- Python 3.10
- torch 1.13.1+cu117
- transformers 4.33.2
- accelerate 0.26.1
- tqdm 4.66.1
- openai 0.28
- peft 0.10.0
- google-generativeai
- pandas
- sentencepiece 0.2.0
- rouge_chinese 1.0.3
- jieba 0.42.1
## 需求其余文件
- 请下载[GLM模型](https://hf-mirror.com/THUDM/chatglm-6b)并放置于到`./THUDM/chatglm-6b`文件夹下
- 请下载[GLM2模型](https://hf-mirror.com/THUDM/chatglm2-6b)并放置于到`./THUDM/chatglm2-6b`文件夹下
- 微调后的lora模型可放置于`./lora`文件夹下可应用于ChatGLM2
- 微调后的ptuning模型可放置于`./ptuning`文件夹下可应用于ChatGLM
- 请下载[GLM3模型](https://hf-mirror.com/THUDM/chatglm3-6b)并放置于`./THUDM/chatglm3-6b`文件夹下
- 微调后的lora模型可放置于`./lora`文件夹下,可应用于ChatGLM2;要应用于glm2,则放置于`./lora/glm2`文件夹下
- 微调后的ptuning模型可放置于`./ptuning`文件夹下,可应用于ChatGLM/ChatGLM2;要应用于glm,则放置于`./ptuning/glm1`文件夹下
- 微调后的qlora/dora模型可放置于`./qlora`文件夹下,可应用于ChatGLM3;要应用于glm3,则放置于`./qlora/glm3`文件夹下
- 微调文件夹名即为参数中微调模型的名称
- 训练数据按照C-Eval格式放置于`./data`文件夹下,文件命名和`eval.py`中的`subject_name`相关
- 相较于C-Eval的数据集代码添加了'qa'的数据集,放置于`./data/qa`文件夹下,为非选择题的问答数据集。
## Run
运行模型评估程序:
```bash
python eval.py --model_name chatglm --cuda_device 0 --finetune ptuning1
python eval.py --model_name chatglm3 --finetune qlora1 --finetune_method qlora --few_shot --ntrain 5 --cuda_device 0
```
## Arguments
对结果文件使用大模型和ROUGE进行评估请自行修改文件内的路径
```bash
python test.py --openai_key [your-api-key]
```
## Arguments(eval.py)
- `--model_name`: 模型名称,可选`chatglm`、`chatglm2`
- `--model_name`: 模型名称,可选`chatglm`、`chatglm2`、`chatglm3`、`gpt-3.5-turbo`
- `--cuda_device`: GPU编号
- `--finetune`: 微调模型名称,为放置于`lora/ptuning`文件夹下的文件夹名
- `--few_shot`: 启用few-shot示例提示进行评估(可选)
- `--ntrain`: few-shot示例的数量(可选)
- `--cot`: 使用思维链(可选)
- `--finetune_method`: 微调方法,可选`lora`、`ptuning`、`qlora`、`dora`
- `--finetune`: 微调文件夹名称,会自动寻找对应位置
- `--openai_key`: OpenAI API Key, 调用OpenAI API进行评估或评估gpt模型需要使用

@ -4,17 +4,20 @@ import pandas as pd
import torch
from evaluators.chatgpt import ChatGPT_Evaluator
from evaluators.chatglm import ChatGLM_Evaluator
from evaluators.chatglm2 import ChatGLM_Evaluator as ChatGLM2_Evaluator
from evaluators.chatglm3 import ChatGLM_Evaluator as ChatGLM3_Evaluator
from evaluators.chatglm2 import ChatGLM2_Evaluator
from evaluators.chatglm3 import ChatGLM3_Evaluator
import time
from scoring.assessment_engine import AssessmentEngine
choices = ["A", "B", "C", "D"]
device = torch.device("cpu")
def main(args):
global device
evaluator_class = None
if args.cuda_device:
os.environ["CUDA_VISIBLE_DEVICES"] = args.cuda_device
device = torch.device("cuda")
@ -28,6 +31,7 @@ def main(args):
elif "chatglm3" in args.model_name:
if args.finetune:
fine_tune_model = args.finetune
evaluator_class = ChatGLM3_Evaluator
else:
fine_tune_model = None
evaluator = ChatGLM3_Evaluator(
@ -41,6 +45,7 @@ def main(args):
elif "chatglm2" in args.model_name:
if args.finetune:
fine_tune_model = args.finetune
evaluator_class = ChatGLM2_Evaluator
else:
fine_tune_model = None
evaluator = ChatGLM2_Evaluator(
@ -54,6 +59,7 @@ def main(args):
elif "chatglm" in args.model_name:
if args.finetune:
fine_tune_model = args.finetune
evaluator_class = ChatGLM_Evaluator
else:
fine_tune_model = None
evaluator = ChatGLM_Evaluator(
@ -76,12 +82,12 @@ def main(args):
else:
fine_tune_model_name = 'original'
save_result_dir = os.path.join(r"logs", f"{args.model_name}_{fine_tune_model_name}/{run_date}")
os.mkdir(save_result_dir)
os.makedirs(save_result_dir)
# subject_list = ['computer_architecture', 'car_knowledge', 'car_use', 'car_market']
subject_list = ['car_knowledge_in_train', 'car_use_in_train', 'car_market_in_train']
qa_subject_list = ['car_knowledge', 'car_use', 'car_market']
# qa_subject_list = ['car_use', 'car_market']
subject_list = ['computer_architecture', 'car_knowledge', 'car_use', 'car_market']
# subject_list = ['car_knowledge_in_train', 'car_use_in_train', 'car_market_in_train']
# qa_subject_list = ['car_knowledge', 'car_use', 'car_market']
qa_subject_list = ['car_market']
for subject_name in subject_list:
print("Now testing: " + subject_name)
@ -98,11 +104,34 @@ def main(args):
save_result_dir=save_result_dir)
print("Acc:", correct_ratio)
# result_list = []
#
# for subject_name in qa_subject_list:
# print("Now testing: " + subject_name)
# qa_file_path = os.path.join('data/qa', f'{subject_name}_qa.csv')
# qa_df = pd.read_csv(qa_file_path)
# evaluator.eval_qa(subject_name, qa_df, save_result_dir=save_result_dir)
# result_list.append(evaluator.eval_qa(subject_name, qa_df, save_result_dir=save_result_dir))
#
# if evaluator_class is not None:
# del evaluator
# evaluator = evaluator_class(
# choices=choices,
# k=args.ntrain,
# model_name=args.model_name,
# device=device
# )
# for index,subject_name in enumerate(qa_subject_list):
# print("Now testing (origin): " + subject_name)
# qa_file_path = os.path.join('data/qa', f'{subject_name}_qa.csv')
# qa_df = pd.read_csv(qa_file_path)
# origin_result = evaluator.eval_qa(subject_name, qa_df, save_result_dir=save_result_dir)
# origin_result = origin_result.rename(columns={"model_output": "predict_origin"})
# result_df = result_list[index].rename(columns={"model_output": "predict_finetune"}).join(origin_result["predict_origin"])
# result_file_name = f'{subject_name}_qa_compare_result.csv'
# result_df.to_csv(os.path.join(save_result_dir, result_file_name))
# assessment_engine = AssessmentEngine(save_result_dir, args.openai_key)
# for subject_name in qa_subject_list:
# assessment_engine.eval_result_diff(f'{subject_name}_qa_compare_result.csv')
if __name__ == "__main__":

@ -6,21 +6,14 @@ from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers.generation.logits_process import LogitsProcessor
from transformers.generation.utils import LogitsProcessorList
from evaluators.evaluator import Evaluator
from evaluators.chatglm_mixin import ChatGLMMixin
class InvalidScoreLogitsProcessor(LogitsProcessor):
    """Sanitize non-finite logits so downstream sampling/argmax cannot fail."""

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
        # Any NaN/Inf invalidates the whole distribution: wipe it and put
        # all probability mass on one known-safe token (id 5).
        scores_invalid = torch.isnan(scores).any() or torch.isinf(scores).any()
        if scores_invalid:
            scores.zero_()
            scores[..., 5] = 5e4
        return scores
class ChatGLM_Evaluator(Evaluator):
def __init__(self, choices, k, model_name, device, finetune=None, finetune_method=None):
class ChatGLM_Evaluator(Evaluator, ChatGLMMixin):
def __init__(self, choices, k, model_name, device='cpu', finetune=None, finetune_method=None):
super(ChatGLM_Evaluator, self).__init__(choices, model_name, k)
# try adding 'mirror="tuna"' and 'resume_download=True' if facing the 'read timed out' problem
# or directly clone the model
self.finetune_method = finetune_method
self.finetune_name = finetune
self.tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True, mirror="tuna")
if finetune_method == "ptuning":
CHECKPOINT_PATH = "ptuning/glm1/" + finetune
@ -39,153 +32,4 @@ class ChatGLM_Evaluator(Evaluator):
self.model = AutoModel.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True, mirror="tuna",
resume_download=True).half().to(device)
print("Model loaded! (GLM original)")
# self.model = self.model.eval()
def eval_subject(self, subject_name, test_df, dev_df=None, few_shot=False, cot=False, save_result_dir=None):
"""Run the multiple-choice evaluation for one subject with ChatGLM.

test_df needs 'question', one column per choice letter and 'answer';
dev_df supplies exemplars when few_shot=True.  Returns accuracy as a
percentage (0-100).  When save_result_dir is set, per-question
outputs/scores are written to a CSV named after subject and accuracy.
NOTE(review): indentation was lost in this diff view; the nesting of the
per-row bookkeeping below (result/score/answer_list appends) should be
confirmed against the original file.
"""
correct_num = 0
result = []
score = []
answer_list = []
if few_shot:
history = self.generate_few_shot_prompt(subject_name, dev_df, cot=cot)
else:
history = []
answers = list(test_df['answer'])
for row_index, row in tqdm(test_df.iterrows(), total=len(test_df)):
question = self.format_example(row, include_answer=False, cot=cot)
if few_shot:
response, _ = self.model.chat(self.tokenizer, question, do_sample=False, history=history)
response = response.strip()
# For ChatGLM, we use answer extraction in answer-only mode too.
ans, direct_extract = self.extract_cot_answer(row, response)
else: # zero-shot: still chat + regex answer extraction (not logits)
response, _ = self.model.chat(self.tokenizer, question, max_length=300,
do_sample=False, history=history)
response = response.strip()
ans, direct_extract = self.extract_cot_answer(row, response)
# print(response, ans)
# ans = self.generate_dist(self.model, self.tokenizer, question, do_sample=False, max_length=2048,
# history=history)
if ans == answers[row_index]:
correct_num += 1
correct = 1
else:
correct = 0
if save_result_dir:
if few_shot:
result.append(response)
score.append(correct)
answer_list.append(ans)
correct_ratio = 100 * correct_num / len(answers)
if save_result_dir:
if few_shot:
test_df['model_output'] = result
test_df['correctness'] = score
test_df['model_answer'] = answer_list
result_file_name = f'{subject_name}_{correct_ratio}_test.csv'
if few_shot:
result_file_name = f'{subject_name}_{correct_ratio}_few_shot_test.csv'
test_df.to_csv(os.path.join(save_result_dir, result_file_name))
return correct_ratio
def eval_qa(self, subject_name, qa_df, save_result_dir=None):
"""Answer each free-form question in qa_df ('question' column) and store
the stripped model response in a new 'model_output' column.  When
save_result_dir is set, write the augmented frame to
'<subject>_qa_test_result.csv'.
"""
history = []
for row_index, row in tqdm(qa_df.iterrows(), total=len(qa_df)):
question = row['question']
response, _ = self.model.chat(self.tokenizer, question, do_sample=False, history=history)
response = response.strip()
qa_df.loc[row_index, 'model_output'] = response
if save_result_dir:
result_file_name = f'{subject_name}_qa_test_result.csv'
qa_df.to_csv(os.path.join(save_result_dir, result_file_name))
def generate_few_shot_prompt(self, subject, dev_df, cot=False):
"""Build a chat history of k exemplars from dev_df (k = self.k, or all
rows when self.k == -1).  The first exemplar carries the subject
instruction prefix; each exemplar is a (question, answer) tuple as
returned by format_example(include_answer=True).
"""
message = []
k = self.k
if self.k == -1:
k = dev_df.shape[0]
message.append(self.format_example(dev_df.iloc[0, :], cot=cot,
add_prompt=f"以下是中国关于{subject}考试的单项选择题,请选出其中的正确答案。\n\n"))
for i in range(1, k):
message.append(self.format_example(dev_df.iloc[i, :], cot=cot))
return message
def format_example(self, line, include_answer=True, cot=False, add_prompt=''):
"""Render one data row as a prompt string.

Without the answer, returns the bare question+choices prompt.  With the
answer, returns a (prompt, answer) tuple usable as a few-shot exemplar;
cot=True wraps the answer in a chain-of-thought template using the
row's 'explanation' column.
"""
example = add_prompt + line['question']
# print(example)
for choice in self.choices:
example += f'\n{choice}. {line[f"{choice}"]}'
example += '\n答案:'
if include_answer:
if cot:
ans = "让我们一步一步思考,\n" + line["explanation"] + f"\n所以答案是{line['answer']}"
else:
ans = line["answer"]
m = (example, ans)
return m
return example
def extract_cot_answer(self, line, gen_ans):
"""Extract the chosen letter from a free-text model answer.

Returns (letter, direct) where direct is True only when the explicit
CoT conclusion '所以答案是X。' was found; '-' means extraction failed.
Fallback order: CoT conclusion, regex patterns, a single bare A-D
character, then a unique quote of one choice's text.
"""
m = re.findall(r'所以答案是(.+?)。', gen_ans, re.M)
if len(m) > 0 and m[-1] in self.choices:
return m[-1], True
answer_patterns = [
r'([ABCD])是正确的',
r'选项([ABCD])正确',
r'答案为([ABCD])',
r'答案是([ABCD])',
r'答案([ABCD])',
r'选择([ABCD])',
r'答案:([ABCD])',
r'选择答案([ABCD])',
r'正确答案是([ABCD])'
]
# RE extraction: first pattern that matches wins.
for answer_pattern in answer_patterns:
m = re.search(answer_pattern, gen_ans, re.M)
if m:
answer = m.group(1)
return answer, False
# only containing one choice-character
m = re.findall(r'[ABCD]', gen_ans, re.M)
if len(m) == 1:
answer = m[0]
return answer, False
answer_word_counter = 0
# only containing one choice-context (the choice's text is quoted once)
for c in self.choices:
if str(line[f'{c}']) in gen_ans:
answer = c
answer_word_counter += 1
if answer_word_counter == 1:
return answer, False
return '-', False
def generate_dist(self, model, tokenizer, query, history, num_beams=1, max_length=2048,
do_sample=False, top_p=0.7, temperature=0.95, logits_processor=None, **kwargs):
"""Pick a choice by comparing first-token logits instead of free chat.

Builds the ChatGLM '[Round i]\\n问:…\\n答:…' prompt from history, runs
generate with output_scores, and ranks the logits of four hard-coded
token ids.  NOTE(review): the max_length parameter is ignored -- 2048 is
hard-coded in gen_kwargs below.  The ids 167/333/251/416 presumably map
to the A/B/C/D tokens of this tokenizer -- confirm against the vocab.
"""
if history is None:
history = []
if logits_processor is None:
logits_processor = LogitsProcessorList()
logits_processor.append(InvalidScoreLogitsProcessor())
gen_kwargs = {"num_beams": num_beams, "do_sample": do_sample, "top_p": top_p, "max_length": 2048,
"temperature": temperature, "logits_processor": logits_processor, **kwargs}
if not history:
prompt = query
else:
prompt = ""
for i, (old_query, response) in enumerate(history):
prompt += "[Round {}]\n问:{}\n答:{}\n".format(i, old_query, response)
prompt += "[Round {}]\n问:{}\n答:".format(len(history), query)
inputs = tokenizer([prompt], return_tensors="pt")
inputs = inputs.to(model.device)
outputs = model.generate(**inputs, return_dict_in_generate=True, output_scores=True, **gen_kwargs)
# scores[0][0]: logits of the first generated token for the single prompt
score = outputs.scores[0][0].tolist()
choice_score = [score[167], score[333], score[251], score[416]]
ranked_index = [index for index, value in
sorted(list(enumerate(choice_score)), key=lambda x: x[1], reverse=True)]
return self.choices[ranked_index[0]]

@ -3,25 +3,15 @@ import re
from tqdm import tqdm
import torch
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers.generation.logits_process import LogitsProcessor
from transformers.generation.utils import LogitsProcessorList
from evaluators.evaluator import Evaluator
from evaluators.chatglm_mixin import ChatGLMMixin
from peft import PeftModel
class InvalidScoreLogitsProcessor(LogitsProcessor):
    """Sanitize non-finite logits so downstream sampling/argmax cannot fail."""

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
        # Any NaN/Inf invalidates the whole distribution: wipe it and put
        # all probability mass on one known-safe token (id 5).
        scores_invalid = torch.isnan(scores).any() or torch.isinf(scores).any()
        if scores_invalid:
            scores.zero_()
            scores[..., 5] = 5e4
        return scores
class ChatGLM_Evaluator(Evaluator):
class ChatGLM2_Evaluator(Evaluator, ChatGLMMixin):
def __init__(self, choices, k, model_name, device, finetune=None, finetune_method=None):
super(ChatGLM_Evaluator, self).__init__(choices, model_name, k)
# try adding 'mirror="tuna"' and 'resume_download=True' if facing the 'read timed out' problem
# or directly clone the model
super(ChatGLM2_Evaluator, self).__init__(choices, model_name, k)
self.finetune_method = finetune_method
self.finetune_name = finetune
self.tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm2-6b", trust_remote_code=True, mirror="tuna")
if finetune_method == "lora":
self.model = AutoModel.from_pretrained("THUDM/chatglm2-6b", trust_remote_code=True, mirror="tuna",
@ -47,163 +37,3 @@ class ChatGLM_Evaluator(Evaluator):
resume_download=True).half().to(device)
print("Model loaded!(GLM2)")
# self.model = self.model.eval()
def eval_subject(self, subject_name, test_df, dev_df=None, few_shot=False, cot=False, save_result_dir=None):
"""ChatGLM2 multiple-choice evaluation for one subject; returns accuracy
as a percentage.  Zero-shot runs prime the chat with an instruction
turn; both branches chat then regex-extract the answer.  Each response
is printed for debugging.  Writes per-question CSV when save_result_dir
is set.
"""
correct_num = 0
result = []
score = []
answer_list = []
if few_shot:
history = self.generate_few_shot_prompt(subject_name, dev_df, cot=cot)
# print(history)
else:
# _ , history = self.model.chat(self.tokenizer, "接下来会提供给你一些选择题,请选出正确的答案。", do_sample=False)
history = [('接下来会提供给你一些选择题,请选出正确的答案,给出正确的选项即可。', '好的,我会尽力解答。')]
# print(history)
answers = list(test_df['answer'])
for row_index, row in tqdm(test_df.iterrows(), total=len(test_df)):
question = self.format_example(row, include_answer=False, cot=cot)
if few_shot:
response, _ = self.model.chat(self.tokenizer, question, max_length=300,
do_sample=False, history=history)
response = response.strip()
# For ChatGLM, we use answer extraction in answer-only mode too.
ans, direct_extract = self.extract_cot_answer(row, response)
else: # zero-shot: same chat call, only the primed history differs
response, _ = self.model.chat(self.tokenizer, question, max_length=300,
do_sample=False, history=history)
response = response.strip()
ans, direct_extract = self.extract_cot_answer(row, response)
print(response, ans)
# ans = self.generate_dist(self.model, self.tokenizer, question, do_sample=False, max_length=2048,
# history=history)
if ans == answers[row_index]:
correct_num += 1
correct = 1
else:
correct = 0
if save_result_dir:
# if few_shot:
result.append(response)
answer_list.append(ans)
score.append(correct)
correct_ratio = 100 * correct_num / len(answers)
if save_result_dir:
# if few_shot:
test_df['model_output'] = result
test_df['correctness'] = score
test_df['model_answer'] = answer_list
result_file_name = f'{subject_name}_{correct_ratio}_test.csv'
if few_shot:
result_file_name = f'{subject_name}_{correct_ratio}_few_shot_test.csv'
test_df.to_csv(os.path.join(save_result_dir, result_file_name))
return correct_ratio
def eval_qa(self, subject_name, qa_df, save_result_dir=None):
"""Answer free-form questions with ChatGLM2, priming the chat with a
domain instruction turn; stores stripped responses in 'model_output'
and optionally writes '<subject>_qa_test_result.csv'.
"""
# history = []
history = [('接下来会给你一些一些汽车领域相关问题,请回答。', '好的,我会尽力解答。')]
for row_index, row in tqdm(qa_df.iterrows(), total=len(qa_df)):
question = row['question']
response, _ = self.model.chat(self.tokenizer, question, max_length=300, do_sample=False, history=history)
# current_length = 0
# response = ""
# for resp, _ in self.model.stream_chat(self.tokenizer, question, max_length=300,
# do_sample=False, history=history):
# print(resp[current_length:], end="", flush=True)
# current_length = len(resp)
# response = resp
# print('')
response = response.strip()
qa_df.loc[row_index, 'model_output'] = response
if save_result_dir:
result_file_name = f'{subject_name}_qa_test_result.csv'
qa_df.to_csv(os.path.join(save_result_dir, result_file_name))
def generate_few_shot_prompt(self, subject, dev_df, cot=False):
"""Build a (question, answer) tuple history of k exemplars from dev_df
(k = self.k, or all rows when self.k == -1); the first exemplar carries
the subject instruction prefix.
"""
message = []
k = self.k
if self.k == -1:
k = dev_df.shape[0]
message.append(self.format_example(dev_df.iloc[0, :], cot=cot,
add_prompt=f"以下是中国关于{subject}考试的单项选择题,请选出其中的正确答案。\n\n"))
for i in range(1, k):
message.append(self.format_example(dev_df.iloc[i, :], cot=cot))
return message
def format_example(self, line, include_answer=True, cot=False, add_prompt=''):
"""Render one row as a prompt string; with include_answer=True returns a
(prompt, answer) exemplar tuple, cot=True using the 'explanation'
column in a chain-of-thought template.
"""
example = add_prompt + line['question']
# print(example)
for choice in self.choices:
example += f'\n{choice}. {line[f"{choice}"]}'
example += '\n答案:'
if include_answer:
if cot:
ans = "让我们一步一步思考,\n" + line["explanation"] + f"\n所以答案是{line['answer']}"
else:
ans = line["answer"]
m = (example, ans)
return m
return example
def extract_cot_answer(self, line, gen_ans):
"""Extract the chosen letter from a free-text answer; returns
(letter, direct) where direct is True only for an explicit
'所以答案是X。' conclusion, '-' on failure.
NOTE(review): unlike the chatglm.py copy, this pattern list lacks
r'正确答案是([ABCD])' -- confirm whether that is intentional.
"""
m = re.findall(r'所以答案是(.+?)。', gen_ans, re.M)
if len(m) > 0 and m[-1] in self.choices:
return m[-1], True
answer_patterns = [
r'([ABCD])是正确的',
r'选项([ABCD])正确',
r'答案为([ABCD])',
r'答案是([ABCD])',
r'答案([ABCD])',
r'选择([ABCD])',
r'答案:([ABCD])',
r'选择答案([ABCD])'
]
# RE extraction: first pattern that matches wins.
for answer_pattern in answer_patterns:
m = re.search(answer_pattern, gen_ans, re.M)
if m:
answer = m.group(1)
return answer, False
# only containing one choice-character
m = re.findall(r'[ABCD]', gen_ans, re.M)
if len(m) == 1:
answer = m[0]
return answer, False
answer_word_counter = 0
# only containing one choice-context
for c in self.choices:
if str(line[f'{c}']) in gen_ans:
answer = c
answer_word_counter += 1
if answer_word_counter == 1:
return answer, False
return '-', False
def generate_dist(self, model, tokenizer, query, history, num_beams=1, max_length=2048,
do_sample=False, top_p=0.7, temperature=0.95, logits_processor=None, **kwargs):
"""Choice selection via first-token logits (see chatglm.py twin).
NOTE(review): max_length is ignored (2048 hard-coded) and the token ids
167/333/251/416 are presumed A/B/C/D -- verify for the GLM2 tokenizer.
"""
if history is None:
history = []
if logits_processor is None:
logits_processor = LogitsProcessorList()
logits_processor.append(InvalidScoreLogitsProcessor())
gen_kwargs = {"num_beams": num_beams, "do_sample": do_sample, "top_p": top_p, "max_length": 2048,
"temperature": temperature, "logits_processor": logits_processor, **kwargs}
if not history:
prompt = query
else:
prompt = ""
for i, (old_query, response) in enumerate(history):
prompt += "[Round {}]\n问:{}\n答:{}\n".format(i, old_query, response)
prompt += "[Round {}]\n问:{}\n答:".format(len(history), query)
inputs = tokenizer([prompt], return_tensors="pt")
inputs = inputs.to(model.device)
outputs = model.generate(**inputs, return_dict_in_generate=True, output_scores=True, **gen_kwargs)
score = outputs.scores[0][0].tolist()
choice_score = [score[167], score[333], score[251], score[416]]
ranked_index = [index for index, value in
sorted(list(enumerate(choice_score)), key=lambda x: x[1], reverse=True)]
return self.choices[ranked_index[0]]

@ -3,13 +3,11 @@ import re
from tqdm import tqdm
import torch
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers.generation.logits_process import LogitsProcessor
from transformers.generation.utils import LogitsProcessorList
from evaluators.evaluator import Evaluator
from evaluators.chatglm_mixin import ChatGLMMixin
from pathlib import Path
from typing import Union, Tuple
import typer
from peft import AutoPeftModelForCausalLM, PeftModelForCausalLM
from transformers import (
AutoModelForCausalLM,
@ -28,12 +26,12 @@ def _resolve_path(path: Union[str, Path]) -> Path:
return Path(path).expanduser().resolve()
def load_model_and_tokenizer(model_dir: Union[str, Path]) -> Tuple[ModelType, TokenizerType]:
def load_model_and_tokenizer(model_dir: Union[str, Path], device) -> Tuple[ModelType, TokenizerType]:
model_dir = _resolve_path(model_dir)
if (model_dir / 'adapter_config.json').exists():
config = PeftConfig.from_pretrained(model_dir)
config = PeftConfig.from_pretrained(str(model_dir))
base_model = AutoModel.from_pretrained("THUDM/chatglm3-6b", trust_remote_code=True, mirror="tuna",
resume_download=True)
resume_download=True).to(device)
# base_model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path,trust_remote_code=True,
# device_map='auto')
model = PeftModel.from_pretrained(base_model, model_dir)
@ -50,204 +48,19 @@ def load_model_and_tokenizer(model_dir: Union[str, Path]) -> Tuple[ModelType, To
return model, tokenizer
class InvalidScoreLogitsProcessor(LogitsProcessor):
    """Sanitize non-finite logits so downstream sampling/argmax cannot fail."""

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
        # Any NaN/Inf invalidates the whole distribution: wipe it and put
        # all probability mass on one known-safe token (id 5).
        scores_invalid = torch.isnan(scores).any() or torch.isinf(scores).any()
        if scores_invalid:
            scores.zero_()
            scores[..., 5] = 5e4
        return scores
class ChatGLM_Evaluator(Evaluator):
class ChatGLM3_Evaluator(Evaluator, ChatGLMMixin):
def __init__(self, choices, k, model_name, device, finetune=None, finetune_method=None):
"""Load ChatGLM3 (optionally a qlora-finetuned checkpoint from
'qlora/glm3/<finetune>') in half precision on `device`, set up the
tokenizer, and put the model in eval mode.
"""
super(ChatGLM3_Evaluator, self).__init__(choices, model_name, k)
self.finetune_method = finetune_method
self.finetune_name = finetune
self.tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm3-6b", trust_remote_code=True, mirror="tuna")
if finetune_method == "qlora":
model_dir = 'qlora/glm3/' + finetune
# load_model_and_tokenizer may replace the tokenizer with the
# checkpoint's own copy.
self.model, self.tokenizer = load_model_and_tokenizer(model_dir, device)
self.model = self.model.half().to(device)
print("Model loaded! use GLM3 " + finetune)
else:
self.model = AutoModel.from_pretrained("THUDM/chatglm3-6b", trust_remote_code=True, mirror="tuna",
resume_download=True).half().to(device)
print("Model loaded! (GLM3)")
# prompt = '以下是中国关于car_knowledge_in_train考试的单项选择题请选出其中的正确答案。\n\n比亚迪的刀片电池采用哪种电池技术\
# nA. 镍氢电池\nB. 锂离子电池\nC. 磷酸铁锂电池\nD. 液态电池\n答案'
# response, history = self.model.chat(self.tokenizer, prompt, max_length=128)
# print(history)
# current_length = 0
# response = ""
# for resp, _ in self.model.stream_chat(self.tokenizer, prompt, max_length=300,
# do_sample=False):
# print(resp[current_length:], end="", flush=True)
# current_length = len(resp)
# response = resp
# print('')
self.model = self.model.eval()
def eval_subject(self, subject_name, test_df, dev_df=None, few_shot=False, cot=False, save_result_dir=None):
"""ChatGLM3 multiple-choice evaluation; returns accuracy as a percentage.
History is a role-dict message list (GLM3 chat format).  Few-shot runs
chat on a copy of the history so exemplars are not mutated between
rows.  Writes per-question CSV when save_result_dir is set.
"""
correct_num = 0
result = []
score = []
answer_list = []
if few_shot:
history = self.generate_few_shot_prompt(subject_name, dev_df, cot=cot)
else:
# _ , history = self.model.chat(self.tokenizer, "接下来会提供给你一些选择题,请选出正确的答案。", do_sample=False)
history = [{'role': 'user',
'content': '接下来会提供给你一些选择题,请选出正确的答案,给出正确的选项即可。'},
{'role': 'assistant',
'content': '好的,我会尽力解答。'}]
answers = list(test_df['answer'])
for row_index, row in tqdm(test_df.iterrows(), total=len(test_df)):
question = self.format_example(row, include_answer=False, cot=cot)
if few_shot:
history_temp = history.copy()
response, _ = self.model.chat(self.tokenizer, question, max_length=300,
do_sample=False, history=history_temp)
response = response.strip()
# For ChatGLM, we use answer extraction in answer-only mode too.
ans, direct_extract = self.extract_cot_answer(row, response)
else: # zero-shot: same chat call, only the primed history differs
response, _ = self.model.chat(self.tokenizer, question, max_length=300,
do_sample=False, history=history)
response = response.strip()
ans, direct_extract = self.extract_cot_answer(row, response)
print(response, ans)
# ans = self.generate_dist(self.model, self.tokenizer, question, do_sample=False, max_length=2048,
# history=history)
if ans == answers[row_index]:
correct_num += 1
correct = 1
else:
correct = 0
if save_result_dir:
# if few_shot:
result.append(response)
answer_list.append(ans)
score.append(correct)
correct_ratio = 100 * correct_num / len(answers)
if save_result_dir:
# if few_shot:
test_df['model_output'] = result
test_df['correctness'] = score
test_df['model_answer'] = answer_list
result_file_name = f'{subject_name}_{correct_ratio}_test.csv'
if few_shot:
result_file_name = f'{subject_name}_{correct_ratio}_few_shot_test.csv'
test_df.to_csv(os.path.join(save_result_dir, result_file_name))
return correct_ratio
def eval_qa(self, subject_name, qa_df, save_result_dir=None):
"""Answer free-form questions with ChatGLM3 using role-dict priming;
stores stripped responses in 'model_output' and optionally writes
'<subject>_qa_test_result.csv'.
"""
# history = []
history = [{'role': 'user',
'content': '接下来会给你一些一些汽车领域相关问题,请回答。'},
{'role': 'assistant',
'content': '好的,我会尽力解答。'}]
for row_index, row in tqdm(qa_df.iterrows(), total=len(qa_df)):
question = row['question']
response, _ = self.model.chat(self.tokenizer, question, max_length=300, do_sample=False, history=history)
response = response.strip()
qa_df.loc[row_index, 'model_output'] = response
if save_result_dir:
result_file_name = f'{subject_name}_qa_test_result.csv'
qa_df.to_csv(os.path.join(save_result_dir, result_file_name))
def generate_few_shot_prompt(self, subject, dev_df, cot=False):
"""Build a role-dict message history of k exemplars from dev_df
(k = self.k, or all rows when self.k == -1).  Uses extend because
GLM3's format_example returns a two-message list per exemplar.
"""
message = []
k = self.k
if self.k == -1:
k = dev_df.shape[0]
message.extend(self.format_example(dev_df.iloc[0, :], cot=cot,
add_prompt=f"以下是中国关于{subject}考试的单项选择题,请选出其中的正确答案。\n\n"))
for i in range(1, k):
message.extend(self.format_example(dev_df.iloc[i, :], cot=cot))
return message
def format_example(self, line, include_answer=True, cot=False, add_prompt=''):
"""Render one row as a prompt string; with include_answer=True returns a
[user, assistant] role-dict message pair (GLM3 chat format), cot=True
using the 'explanation' column in a chain-of-thought template.
"""
example = add_prompt + line['question']
# print(example)
for choice in self.choices:
example += f'\n{choice}. {line[f"{choice}"]}'
example += '\n答案:'
if include_answer:
if cot:
ans = "让我们一步一步思考,\n" + line["explanation"] + f"\n所以答案是{line['answer']}"
else:
ans = line["answer"]
m = [{
'role': 'user',
'content': example
}, {
'role': 'assistant',
'content': ans
}]
return m
return example
def extract_cot_answer(self, line, gen_ans):
"""Extract the chosen letter from a free-text answer; returns
(letter, direct) where direct is True only for an explicit
'所以答案是X。' conclusion, '-' on failure.
NOTE(review): pattern list lacks r'正确答案是([ABCD])' present in the
chatglm.py copy -- confirm whether that is intentional.
"""
m = re.findall(r'所以答案是(.+?)。', gen_ans, re.M)
if len(m) > 0 and m[-1] in self.choices:
return m[-1], True
answer_patterns = [
r'([ABCD])是正确的',
r'选项([ABCD])正确',
r'答案为([ABCD])',
r'答案是([ABCD])',
r'答案([ABCD])',
r'选择([ABCD])',
r'答案:([ABCD])',
r'选择答案([ABCD])'
]
# RE extraction: first pattern that matches wins.
for answer_pattern in answer_patterns:
m = re.search(answer_pattern, gen_ans, re.M)
if m:
answer = m.group(1)
return answer, False
# only containing one choice-character
m = re.findall(r'[ABCD]', gen_ans, re.M)
if len(m) == 1:
answer = m[0]
return answer, False
answer_word_counter = 0
# only containing one choice-context
for c in self.choices:
if str(line[f'{c}']) in gen_ans:
answer = c
answer_word_counter += 1
if answer_word_counter == 1:
return answer, False
return '-', False
def generate_dist(self, model, tokenizer, query, history, num_beams=1, max_length=2048,
do_sample=False, top_p=0.7, temperature=0.95, logits_processor=None, **kwargs):
"""Choice selection via first-token logits (see chatglm.py twin).
NOTE(review): max_length is ignored (2048 hard-coded); token ids
167/333/251/416 are presumed A/B/C/D and the '[Round i]' prompt format
predates GLM3's role-dict chat -- verify before use.
"""
if history is None:
history = []
if logits_processor is None:
logits_processor = LogitsProcessorList()
logits_processor.append(InvalidScoreLogitsProcessor())
gen_kwargs = {"num_beams": num_beams, "do_sample": do_sample, "top_p": top_p, "max_length": 2048,
"temperature": temperature, "logits_processor": logits_processor, **kwargs}
if not history:
prompt = query
else:
prompt = ""
for i, (old_query, response) in enumerate(history):
prompt += "[Round {}]\n问:{}\n答:{}\n".format(i, old_query, response)
prompt += "[Round {}]\n问:{}\n答:".format(len(history), query)
inputs = tokenizer([prompt], return_tensors="pt")
inputs = inputs.to(model.device)
outputs = model.generate(**inputs, return_dict_in_generate=True, output_scores=True, **gen_kwargs)
score = outputs.scores[0][0].tolist()
choice_score = [score[167], score[333], score[251], score[416]]
ranked_index = [index for index, value in
sorted(list(enumerate(choice_score)), key=lambda x: x[1], reverse=True)]
return self.choices[ranked_index[0]]

@ -0,0 +1,175 @@
import os
import re
from tqdm import tqdm
import torch
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers.generation.logits_process import LogitsProcessor
from transformers.generation.utils import LogitsProcessorList
from evaluators.evaluator import Evaluator
class ChatGLMMixin:
def __init__(self):
self.tokenizer = None
self.model = None
self.model_name = None
self.k = None
self.choices = None
self.finetune_name = None
def eval_subject(self, subject_name, test_df, dev_df=None, few_shot=False, cot=False, save_result_dir=None):
correct_num = 0
result = []
score = []
answer_list = []
if few_shot:
history = self.generate_few_shot_prompt(subject_name, dev_df, cot=cot)
else:
history = self.generate_zero_shot_prompt(is_choice_question=True)
answers = list(test_df['answer'])
for row_index, row in tqdm(test_df.iterrows(), total=len(test_df)):
question = self.format_example(row, include_answer=False, cot=cot)
if few_shot:
response, _ = self.model.chat(self.tokenizer, question, max_length=2000,
do_sample=False, history=history)
response = response.strip()
ans, direct_extract = self.extract_cot_answer(row, response)
else: # zero-shot by extracting answer from distribution
response, _ = self.model.chat(self.tokenizer, question, max_length=2000,
do_sample=False, history=history)
response = response.strip()
ans, direct_extract = self.extract_cot_answer(row, response)
if ans == answers[row_index]:
correct_num += 1
correct = 1
else:
correct = 0
if save_result_dir:
result.append(response)
score.append(correct)
answer_list.append(ans)
correct_ratio = 100 * correct_num / len(answers)
if save_result_dir:
test_df['model_output'] = result
test_df['correctness'] = score
test_df['model_answer'] = answer_list
result_file_name = f'{subject_name}_{correct_ratio}_test.csv'
if few_shot:
result_file_name = f'{subject_name}_{correct_ratio}_few_shot_test.csv'
test_df.to_csv(os.path.join(save_result_dir, result_file_name))
return correct_ratio
def eval_qa(self, subject_name, qa_df, save_result_dir=None):
history = self.generate_zero_shot_prompt(is_choice_question=False)
for row_index, row in tqdm(qa_df.iterrows(), total=len(qa_df)):
question = row['question']
response, _ = self.model.chat(self.tokenizer, question, max_length=2000,
do_sample=False, history=history)
response = response.strip()
qa_df.loc[row_index, 'model_output'] = response
# current_length = 0
# response = ""
# for resp, _ in self.model.stream_chat(self.tokenizer, question, max_length=300,
# do_sample=False, history=history):
# print(resp[current_length:], end="", flush=True)
# current_length = len(resp)
# response = resp
# print('')
if save_result_dir and self.finetune_name is not None:
result_file_name = f'{subject_name}_qa_test_result.csv'
qa_df.to_csv(os.path.join(save_result_dir, result_file_name))
return qa_df
def generate_few_shot_prompt(self, subject, dev_df, cot=False):
message = []
k = self.k
if self.k == -1:
k = dev_df.shape[0]
init_example = self.format_example(dev_df.iloc[0, :], cot=cot,
add_prompt=f"以下是中国关于{subject}考试的单项选择题,请选出其中的正确答案。\n\n")
if isinstance(init_example, list):
message.extend(init_example)
else:
message.append(init_example)
for i in range(1, k):
example = self.format_example(dev_df.iloc[i, :], cot=cot)
if isinstance(example, list):
message.extend(example)
else:
message.append(example)
return message
def generate_zero_shot_prompt(self, is_choice_question=True):
if self.model_name == 'chatglm3' and is_choice_question:
return [{'role': 'user',
'content': '接下来会提供给你一些选择题,请选出正确的答案,给出正确的选项即可。'},
{'role': 'assistant',
'content': '好的,我会尽力解答。'}]
elif self.model_name == 'chatglm3' and not is_choice_question:
return [{'role': 'user',
'content': '接下来会给你一些一些汽车领域相关问题,请回答。'},
{'role': 'assistant',
'content': '好的,我会尽力解答。'}]
else:
return []
def format_example(self, line, include_answer=True, cot=False, add_prompt=''):
example = add_prompt + line['question']
# print(example)
for choice in self.choices:
example += f'\n{choice}. {line[f"{choice}"]}'
example += '\n答案:'
if include_answer:
if cot:
ans = "让我们一步一步思考,\n" + line["explanation"] + f"\n所以答案是{line['answer']}"
else:
ans = line["answer"]
if self.model_name == 'chatglm3':
m = [{
'role': 'user',
'content': example
}, {
'role': 'assistant',
'content': ans
}]
else:
m = (example, ans)
return m
return example
def extract_cot_answer(self, line, gen_ans):
    """Pull the final choice letter out of a chain-of-thought answer.

    Returns (choice, matched_cot): matched_cot is True only when the
    canonical "所以答案是X。" phrasing matched. Fallbacks, in order:
    known answer phrasings, a single bare A-D letter, and finally a
    unique occurrence of one option's text. '-' means no match.
    """
    cot_hits = re.findall(r'所以答案是(.+?)。', gen_ans, re.M)
    if cot_hits and cot_hits[-1] in self.choices:
        return cot_hits[-1], True
    fallback_patterns = (
        r'([ABCD])是正确的',
        r'选项([ABCD])正确',
        r'答案为([ABCD])',
        r'答案是([ABCD])',
        r'答案([ABCD])',
        r'选择([ABCD])',
        r'答案:([ABCD])',
        r'选择答案([ABCD])',
        r'正确答案是([ABCD])',
    )
    for pat in fallback_patterns:
        hit = re.search(pat, gen_ans, re.M)
        if hit:
            return hit.group(1), False
    bare_letters = re.findall(r'[ABCD]', gen_ans, re.M)
    if len(bare_letters) == 1:
        return bare_letters[0], False
    # Last resort: exactly one option's text appears in the generation.
    text_matches = [c for c in self.choices if str(line[f'{c}']) in gen_ans]
    if len(text_matches) == 1:
        return text_matches[0], False
    return '-', False

@ -6,6 +6,37 @@ from time import sleep
import re
def extract_ans(response_str):
    """Extract chosen option letters (A-D) from a model response.

    Checks the leading character first, then a list of answer phrasings
    (the first pattern that matches wins). Returns a possibly-empty list
    of letters.
    """
    # Fix: the original indexed response_str[0] unconditionally, which
    # raised IndexError when the model returned an empty string.
    if not response_str:
        return []
    pattern = [
        r"^选([A-D])",
        r"^选项([A-D])",
        r"答案是\s?选?项?\s?([A-D])",
        r"答案为\s?选?项?\s?([A-D])",
        r"答案应为\s?选?项?\s?([A-D])",
        r"答案选\s?选?项?\s?([A-D])",
        r"答案是:\s?选?项?\s?([A-D])",
        r"答案应该是:\s?选?项?\s?([A-D])",
        r"正确的一项是\s?([A-D])",
        r"答案为:\s?选?项?\s?([A-D])",
        r"答案应为:\s?选?项?\s?([A-D])",
        r"答案:\s?选?项?\s?([A-D])",
        r"答案是:\s?选?项?\s?([A-D])",
        r"答案应该是:\s?选?项?\s?([A-D])",
        r"答案为:\s?选?项?\s?([A-D])",
        r"答案应为:\s?选?项?\s?([A-D])",
        r"答案:\s?选?项?\s?([A-D])",
    ]
    ans_list = []
    if response_str[0] in ('A', 'B', 'C', 'D'):
        ans_list.append(response_str[0])
    for p in pattern:
        if ans_list:
            break
        ans_list = re.findall(p, response_str)
    return ans_list
class ChatGPT_Evaluator(Evaluator):
def __init__(self, choices, k, api_key,model_name):
super(ChatGPT_Evaluator, self).__init__(choices, model_name, k)
@ -34,6 +65,7 @@ class ChatGPT_Evaluator(Evaluator):
return [
{"role":"user","content":example},
]
def generate_few_shot_prompt(self, subject, dev_df, cot=False):
prompt=[
{
@ -119,7 +151,7 @@ class ChatGPT_Evaluator(Evaluator):
correct=0
else:
if len(response_str)>0:
ans_list=self.extract_ans(response_str)
ans_list= extract_ans(response_str)
if len(ans_list)>0 and (ans_list[-1]==row["answer"]):
correct_num+=1
correct=1
@ -162,7 +194,7 @@ class ChatGPT_Evaluator(Evaluator):
print(msg)
sleep(5)
continue
if response==None:
if response is None:
response_str=""
qa_df.loc[row_index, 'model_output'] = response_str
else:
@ -171,33 +203,3 @@ class ChatGPT_Evaluator(Evaluator):
if save_result_dir:
result_file_name = f'{subject_name}_qa_test_result.csv'
qa_df.to_csv(os.path.join(save_result_dir, result_file_name))
def extract_ans(self, response_str):
    """Extract chosen option letters (A-D) from a model response.

    Method twin of the module-level extract_ans; kept for callers that
    still use self.extract_ans. Returns a possibly-empty list of letters.
    """
    # Fix: the original indexed response_str[0] unconditionally, which
    # raised IndexError when the model returned an empty string.
    if not response_str:
        return []
    pattern = [
        r"^选([A-D])",
        r"^选项([A-D])",
        r"答案是\s?选?项?\s?([A-D])",
        r"答案为\s?选?项?\s?([A-D])",
        r"答案应为\s?选?项?\s?([A-D])",
        r"答案选\s?选?项?\s?([A-D])",
        r"答案是:\s?选?项?\s?([A-D])",
        r"答案应该是:\s?选?项?\s?([A-D])",
        r"正确的一项是\s?([A-D])",
        r"答案为:\s?选?项?\s?([A-D])",
        r"答案应为:\s?选?项?\s?([A-D])",
        r"答案:\s?选?项?\s?([A-D])",
        r"答案是:\s?选?项?\s?([A-D])",
        r"答案应该是:\s?选?项?\s?([A-D])",
        r"答案为:\s?选?项?\s?([A-D])",
        r"答案应为:\s?选?项?\s?([A-D])",
        r"答案:\s?选?项?\s?([A-D])",
    ]
    ans_list = []
    if response_str[0] in ('A', 'B', 'C', 'D'):
        ans_list.append(response_str[0])
    for p in pattern:
        if ans_list:
            break
        ans_list = re.findall(p, response_str)
    return ans_list

@ -7,31 +7,6 @@ class Evaluator:
self.k = k
self.puncs = list(string.punctuation)
def format_example(self, line, include_answer=True):
    """Render one row as a plain-text prompt, optionally with the answer.

    Output is the question, one line per option, and a trailing answer
    label; when include_answer is True the gold answer plus a blank line
    separator is appended.
    """
    option_lines = [f'\n{c}. {line[f"{c}"]}' for c in self.choices]
    text = line['question'] + ''.join(option_lines) + '\n答案:'
    if include_answer:
        text += f'{line["answer"]}\n\n'
    return text
def generate_few_shot_prompt(self, subject, dev_df):
    """Concatenate the subject header and the first k formatted examples.

    k == -1 means use every row of dev_df.
    """
    shot_count = dev_df.shape[0] if self.k == -1 else self.k
    examples = [self.format_example(dev_df.iloc[i, :]) for i in range(shot_count)]
    return (f"以下是中国关于{subject}考试的单项选择题,请选出其中的正确答案。\n\n"
            + ''.join(examples))
def eval_subject(self, subject_name, test_df, dev_df=None, few_shot=False, save_result_dir=None):
    """Evaluate one subject's multiple-choice test set.

    Base-class stub: concrete model evaluators override this. dev_df
    supplies the few-shot examples when few_shot is True; save_result_dir,
    when set, is where per-question results are written.
    """
    pass
def eval_qa(self, subject_name, qa_df, save_result_dir=None):
    """Evaluate one subject's open-ended QA set.

    Base-class stub: concrete model evaluators override this.
    """
    pass
def normalize_answer(self,s):
def white_space_fix(text):

@ -0,0 +1,67 @@
import math
import os

import pandas as pd

from scoring.gpt_scorer import GPTScorer, extract_score

# One-off script: re-extract numeric scores from a previously saved
# ChatGPT-evaluation CSV (the raw judge replies are re-parsed with the
# current extract_score) and write the refreshed table back out.
INPUT_CSV = 'logs/other/20240408181951_result_diff_test_score_82.95347116717225.csv'
OUTPUT_CSV = 'logs/other/re_20240408181951_result_diff_test_score_82.95347116717225.csv'

machine_score_df = pd.read_csv(INPUT_CSV)
# SECURITY fix: the OpenAI key was hard-coded (and therefore leaked) here;
# read it from the environment instead. GPTScorer is constructed for its
# side effect of configuring the openai client.
gpt_scorer = GPTScorer(os.environ.get('OPENAI_API_KEY', ''))

# (judge-reply column, parsed-score column) pairs; previously this was the
# same extract_score block copy-pasted five times.
_SCORE_COLUMNS = [
    ('acc_response_finetune', 'acc_finetune'),
    ('acc_response_origin', 'acc_origin'),
    ('fluency_response_finetune', 'fluency_finetune'),
    ('fluency_response_origin', 'fluency_origin'),
    ('diff_score_response', 'diff_score'),
]

score_sums = {score_col: 0.0 for _, score_col in _SCORE_COLUMNS}
finetune_rouge_score_sum = 0.0
origin_rouge_score_sum = 0.0
row_count = 0
for row_index, row in machine_score_df.iterrows():
    row_count += 1
    for response_col, score_col in _SCORE_COLUMNS:
        score = extract_score(row[response_col])
        machine_score_df.loc[row_index, score_col] = score
        score_sums[score_col] += float(score)
    # ROUGE columns were already numeric; just accumulate them.
    origin_rouge_score_sum += row['rouge_score_origin']
    finetune_rouge_score_sum += row['rouge_score_finetune']

machine_score_df.to_csv(OUTPUT_CSV, index=False)
# The commented-out summary/印分 block that used to follow was dead code
# and has been removed; the per-column sums remain available in
# score_sums / *_rouge_score_sum if a summary is needed again.

@ -11,7 +11,7 @@ class AssessmentEngine:
self.gpt_scorer = GPTScorer(api_key)
def eval_subject(self, subject_name, csv_file_name):
qa_result_df = pd.read_csv('logs/' + self.save_result_dir + '/' + csv_file_name)
qa_result_df = pd.read_csv(self.save_result_dir + '/' + csv_file_name)
start_time = time.time()
row_count = 0
rouge_score_sum = 0
@ -33,11 +33,17 @@ class AssessmentEngine:
elapsed_time = end_time - start_time
print("共评估结果" + str(row_count) + "条,总共用时:", elapsed_time, "")
synthesis_score = rouge_score_sum / row_count
qa_result_df.to_csv('logs/' + self.save_result_dir + '/' + subject_name + '_qa_test_score_'
qa_result_df.to_csv(self.save_result_dir + '/' + subject_name + '_qa_test_score_'
+ str(synthesis_score) + '.csv', index=False)
def eval_result_diff(self, csv_file_name):
result_diff_df = pd.read_csv('logs/' + self.save_result_dir + '/' + csv_file_name)
def eval_result_diff(self, csv_file_name, file_type='csv'):
if file_type == 'json':
result_diff_df = pd.read_json(self.save_result_dir + '/' + csv_file_name)
elif file_type == 'csv':
result_diff_df = pd.read_csv(self.save_result_dir + '/' + csv_file_name)
else:
print("Unknown file type:" + file_type)
return
result_diff_df['rouge_score_finetune'] = 0
result_diff_df['rouge_score_origin'] = 0
result_diff_df['acc_finetune'] = 0
@ -118,7 +124,7 @@ class AssessmentEngine:
result_diff_df.loc[row_index, 'diff_score_response'] = gpt_response_diff
if (gpt_score_diff is not None) and gpt_score_diff.isdigit():
model_better_score_sum += float(gpt_score_diff)
result_diff_df.to_csv('logs/' + self.save_result_dir + '/result_diff_test_score_tmp.csv', index=False)
result_diff_df.to_csv(self.save_result_dir + '/result_diff_test_score_tmp.csv', index=False)
end_time = time.time()
elapsed_time = end_time - start_time
@ -145,5 +151,60 @@ class AssessmentEngine:
print("原模型综合评分:", original_synthesis_score)
# 获取当前时间的字符串
current_time = time.strftime("%Y%m%d%H%M%S", time.localtime(time.time()))
result_diff_df.to_csv('logs/' + self.save_result_dir + '/' + current_time + '_result_diff_test_score_'
result_diff_df.to_csv(self.save_result_dir + '/' + current_time + '_result_diff_test_score_'
+ str(synthesis_score) + '.csv', index=False)
def eval_result(self, file_name, file_type='csv'):
    """Score one model's QA results with ROUGE plus ChatGPT judgements.

    Reads file_name (csv or json) from self.save_result_dir, adds per-row
    ROUGE-1 F, accuracy and fluency scores (the latter two judged by
    ChatGPT via self.gpt_scorer), prints the averages and a synthesised
    score, and writes the scored table back to self.save_result_dir.
    """
    start_time = time.time()
    if file_type == 'json':
        result_df = pd.read_json(self.save_result_dir + '/' + file_name)
    elif file_type == 'csv':
        result_df = pd.read_csv(self.save_result_dir + '/' + file_name)
    else:
        print("Unsupported file type:" + file_type)
        return
    # Numeric score columns start at 0. Fix: the judge-reply columns hold
    # text, so initialise them with '' rather than int 0 (the original
    # mixed dtypes by assigning strings into int columns).
    result_df['rouge_score_finetune'] = 0
    result_df['acc_finetune'] = 0
    result_df['fluency_finetune'] = 0
    result_df['acc_response_finetune'] = ''
    result_df['fluency_response_finetune'] = ''
    rouge_score_sum = 0
    acc_score_sum = 0
    fluency_score_sum = 0
    row_count = 0
    for row_index, row in tqdm(result_df.iterrows(), total=len(result_df)):
        row_count += 1
        test_question = row['question']
        model_response = row['Predict']
        reference_answer = row['answer']
        rouge_scores = get_rouge_score(model_response, reference_answer)
        rouge_1_f = rouge_scores['rouge-1']['f']
        rouge_score_sum += rouge_1_f
        result_df.loc[row_index, 'rouge_score_finetune'] = rouge_1_f
        # Accuracy judgement from ChatGPT.
        self.gpt_scorer.mode("accuracy")
        gpt_response_acc, gpt_score_acc = self.gpt_scorer.score_with_chatgpt(
            test_question, model_response, reference_answer)
        result_df.loc[row_index, 'acc_finetune'] = gpt_score_acc
        result_df.loc[row_index, 'acc_response_finetune'] = gpt_response_acc
        if (gpt_score_acc is not None) and gpt_score_acc.isdigit():
            acc_score_sum += float(gpt_score_acc)
        # Fluency judgement from ChatGPT.
        self.gpt_scorer.mode("fluency")
        gpt_response_fluency, gpt_score_fluency = self.gpt_scorer.score_with_chatgpt(
            test_question, model_response, reference_answer)
        result_df.loc[row_index, 'fluency_finetune'] = gpt_score_fluency
        result_df.loc[row_index, 'fluency_response_finetune'] = gpt_response_fluency
        if (gpt_score_fluency is not None) and gpt_score_fluency.isdigit():
            fluency_score_sum += float(gpt_score_fluency)
        # Checkpoint after every row so an API failure doesn't lose progress.
        result_df.to_csv(self.save_result_dir + '/result_test_score_tmp.csv', index=False)
    end_time = time.time()
    elapsed_time = end_time - start_time
    print("共评估结果" + str(row_count) + "条,总共用时:", elapsed_time, "")
    # Fix: guard against an empty result file, which previously raised
    # ZeroDivisionError when averaging.
    if row_count == 0:
        return
    rouge_avg = rouge_score_sum / row_count
    acc_avg = acc_score_sum / row_count
    fluency_avg = fluency_score_sum / row_count
    print("ROUGE分数", rouge_avg)
    print("准确性分数:", acc_avg)
    print("流畅度分数:", fluency_avg)
    # Weighted synthesis: ROUGE (0-1) scaled to percent, accuracy out of 4,
    # fluency out of 3; the constant 66 stands in for the missing
    # better-than-origin comparison term used in eval_result_diff.
    synthesis_score = (rouge_avg * 100 + acc_avg * 100 / 4 + fluency_avg * 100 / 3 + 66) / 4
    print("综合评分:", synthesis_score)
    result_df.to_csv(self.save_result_dir + f'/result_test_score_{synthesis_score}.csv', index=False)

@ -0,0 +1,7 @@
import os

import google.generativeai as genai

# Smoke test for the Gemini API.
# SECURITY fix: the API key was hard-coded (and therefore leaked) in
# source; read it from the environment instead.
genai.configure(api_key=os.environ.get('GOOGLE_API_KEY', ''))

model = genai.GenerativeModel('gemini-pro')
response = model.generate_content("Write a story about a magic backpack.")
print(response.text)

@ -2,6 +2,39 @@ import openai
import re
def extract_score(response_text):
    """Parse a 1-5 score out of a ChatGPT judgement reply.

    Coerces the reply to str (it may be None on API failure), tries the
    known score phrasings in order, and returns the first captured digit
    as a string; '3' is the neutral fallback when nothing matches.
    """
    text = str(response_text)
    # Ordered from most to least specific; the first hit wins.
    score_patterns = (
        r"^评分为([1-5])分",
        r"评分:([1-5])分",
        r"评分为([1-5])",
    )
    for pat in score_patterns:
        hits = re.findall(pat, text)
        if hits:
            return hits[0]
    return '3'
def request_gpt(prompt, retries=3):
    """Call the ChatGPT chat-completion API with simple retry.

    prompt is the chat message list. Returns the reply text, or None when
    every attempt failed.
    """
    def _ordinal(n):
        # 11th-13th (and the rest of 10-20) take "th"; otherwise the
        # suffix follows the last digit.
        if 10 <= n % 100 <= 20:
            suffix = "th"
        else:
            suffix = {1: "st", 2: "nd", 3: "rd"}.get(n % 10, "th")
        return str(n) + suffix

    for attempt in range(retries):
        try:
            completion = openai.ChatCompletion.create(
                model="gpt-3.5-turbo",
                messages=prompt,
            )
            return completion.choices[0]['message']['content']
        except Exception as e:
            print(f"An error occurred while scoring with ChatGPT: {e}, it's the {_ordinal(attempt + 1)} time.")
    return None
class GPTScorer:
def __init__(self, api_key):
openai.api_key = api_key
@ -17,32 +50,17 @@ class GPTScorer:
def score_with_chatgpt(self, question, model_result, reference, origin_model_result=None):
prompt = self.generate_scoring_prompt(question, model_result, reference, origin_model_result)
try:
# 提交文本以获取ChatGPT评分
response = openai.ChatCompletion.create(
model="gpt-3.5-turbo",
messages=prompt,
)
# 提取评分
chatgpt_response = response.choices[0]['message']['content']
chatgpt_score = self.extract_score(chatgpt_response)
chatgpt_response = request_gpt(prompt, retries=5)
chatgpt_score = extract_score(chatgpt_response)
return chatgpt_response, chatgpt_score
except Exception as e:
print("An error occurred while scoring with ChatGPT:", e)
print("An error occurred while extract score:", e)
return None, '2'
def generate_scoring_prompt(self, question, model_result, reference, origin_model_result=None):
# 生成评分提示
base_prompt = []
if self.eval_mode == "accuracy":
# base_prompt = [{
# "role": "system",
# "content": "你是一个汽车领域专家,接下来将向你提供一个问题、一个参考答案和一个大模型生成的结果。"
# "请对比参考答案和大模型生成结果从信息准确性的角度评分以下生成的结果以评估其质量。满分为5分。"
# "评分标准为信息准确无误——5分。信息大致符合实际信息——4分。"
# "信息不全面但明确表达了自身无法回答——3分。信息完全错误——2分。回答无关或回答语句不完整——1分。"
# "可以根据实际情况稍作调整。"
# "回复格式为评分为x分。理由xxx。"
# }]
base_prompt = [{
"role": "system",
"content": "你是一个汽车领域专家,接下来将向你提供一个问题、一个参考答案和一个大模型生成的结果。"
@ -90,23 +108,6 @@ class GPTScorer:
}
]
return prompt
# AIzaSyAW_h8itGLwNhYTfx1EDLthhcHHlcIfs7w google
def extract_score(self, response_text):
    """Parse a 1-5 score from a ChatGPT judgement reply; '3' on no match.

    Method twin of the module-level extract_score, kept for callers that
    still use self.extract_score.
    """
    # Fix: coerce to str first — the reply is None when the API call
    # failed, and re.findall on None raised TypeError. This matches the
    # module-level extract_score.
    response_text = str(response_text)
    pattern = [
        r"^评分为([1-5])分",
        r"评分:([1-5])分",
    ]
    score_list = []
    for p in pattern:
        if len(score_list) == 0:
            score_list = re.findall(p, response_text)
        else:
            break
    if len(score_list) == 0:
        return '3'
    return score_list[0]
# 示例用法
@ -116,17 +117,19 @@ if __name__ == "__main__":
# 初始化模型评分器
scorer = GPTScorer(my_api_key)
# 要评分的大模型结果
sample_question = "秦Plus-DMi车型的安全气囊有哪些类型"
sample_model_result = ("截止到我最后更新知识的时候关于秦Plus-DMi车型的具体安全气囊类型的信息我并没有。"
"通常来说,汽车的安全气囊系统可能包括驾驶员气囊、副驾驶气囊、侧面气囊、头部气囊等。"
"但具体车型的安全气囊配置可能会因地区、年份和车型的不同而有所差异。"
"建议您直接查询该车型的官方资料或者联系经销商以获取最准确的信息。")
sample_reference = "秦Plus-DMi配备有驾驶员安全气囊、前排乘员安全气囊、侧帘式安全气囊和座椅侧安全气囊。"
print(extract_score('理由参考答案与生成的结果完全一致信息准确无误。因此评分为4分。'))
# 获取ChatGPT评分
response_text, score = scorer.mode('accuracy').score_with_chatgpt(sample_question, sample_model_result, sample_reference)
if response_text is not None:
print("ChatGPT评分:", score, "\nChatGPT回复:", response_text)
else:
print("无法获取ChatGPT评分。")
# 要评分的大模型结果
# sample_question = "秦Plus-DMi车型的安全气囊有哪些类型"
# sample_model_result = ("截止到我最后更新知识的时候关于秦Plus-DMi车型的具体安全气囊类型的信息我并没有。"
# "通常来说,汽车的安全气囊系统可能包括驾驶员气囊、副驾驶气囊、侧面气囊、头部气囊等。"
# "但具体车型的安全气囊配置可能会因地区、年份和车型的不同而有所差异。"
# "建议您直接查询该车型的官方资料或者联系经销商以获取最准确的信息。")
# sample_reference = "秦Plus-DMi配备有驾驶员安全气囊、前排乘员安全气囊、侧帘式安全气囊和座椅侧安全气囊。"
#
# # 获取ChatGPT评分
# response_text, score = scorer.mode('accuracy').score_with_chatgpt(sample_question, sample_model_result, sample_reference)
# if response_text is not None:
# print("ChatGPT评分:", score, "\nChatGPT回复:", response_text)
# else:
# print("无法获取ChatGPT评分。")

@ -1,28 +1,37 @@
import argparse

from scoring.assessment_engine import AssessmentEngine


def main(args):
    """Run ChatGPT- and ROUGE-based evaluation over saved result files.

    Paths and file names are artefacts of specific runs; adjust them in
    place before running (see README).
    """
    assessment_engine = AssessmentEngine("logs/other", args.openai_key)
    assessment_engine.eval_result_diff("0319output.csv")
    assessment_engine.eval_result("output-pt-sft.json", "json")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # SECURITY fix: the OpenAI key used to be hard-coded at module level
    # (and the whole evaluation ran on import); it is now taken from the
    # command line and the duplicated module-level run was removed.
    parser.add_argument("--openai_key", type=str, default="xxx")
    user_args = parser.parse_args()
    main(user_args)

Loading…
Cancel
Save