Add an LLM-based answer scoring module and a Q&A dataset processing module (work in progress).

main
PeterAlbus 9 months ago
parent b3f8e768ff
commit c06f9a3684

@@ -9,7 +9,7 @@ A simple program to evaluate large language model.
- transformers 4.33.2
- accelerate 0.26.1
- tqdm 4.66.1
- openai 1.10.0
- openai 0.28
## Other required files
@@ -18,6 +18,7 @@ A simple program to evaluate large language model.
- Fine-tuned LoRA models can be placed in the `./lora` folder and applied to ChatGLM2
- Fine-tuned P-Tuning models can be placed in the `./ptuning` folder and applied to ChatGLM
- Evaluation data in C-Eval format goes in the `./data` folder; file names correspond to the `subject_name` values used in `eval.py`
- In addition to the C-Eval-style datasets, a 'qa' dataset type has been added under `./data/qa`: free-form (non-multiple-choice) question-answering data (see the layout sketch below).
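
A sketch of the resulting layout, based on the paths and subject lists hard-coded in `eval.py` (each `*_qa.csv` is assumed to need at least a `question` column, since that is the only field `eval_qa` reads):

```
data/
├── val/
│   ├── computer_architecture_val.csv
│   ├── car_knowledge_val.csv
│   ├── car_use_val.csv
│   └── car_market_val.csv
└── qa/
    ├── car_knowledge_qa.csv
    ├── car_use_qa.csv
    └── car_market_qa.csv
```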
## Run

@@ -15,7 +15,7 @@ def main(args):
if "turbo" in args.model_name or "gpt-4" in args.model_name:
# print("Not supported yet")
# return -1
evaluator=ChatGPT_Evaluator(
evaluator = ChatGPT_Evaluator(
choices=choices,
k=args.ntrain,
api_key=args.openai_key,
@@ -66,9 +66,10 @@ def main(args):
os.mkdir(save_result_dir)
subject_list = ['computer_architecture', 'car_knowledge', 'car_use', 'car_market']
qa_subject_list = ['car_knowledge', 'car_use', 'car_market']
for subject_name in subject_list:
print(subject_name)
print("Now testing: " + subject_name)
# subject_name=args.subject
val_file_path = os.path.join('data/val', f'{subject_name}_val.csv')
val_df = pd.read_csv(val_file_path)
@@ -82,6 +83,12 @@ def main(args):
save_result_dir=save_result_dir)
print("Acc:", correct_ratio)
for subject_name in qa_subject_list:
print("Now testing: " + subject_name)
qa_file_path = os.path.join('data/qa', f'{subject_name}_qa.csv')
qa_df = pd.read_csv(qa_file_path)
evaluator.eval_qa(subject_name, qa_df, save_result_dir=save_result_dir)
if __name__ == "__main__":
parser = argparse.ArgumentParser()

@@ -7,6 +7,7 @@ from transformers.generation.logits_process import LogitsProcessor
from transformers.generation.utils import LogitsProcessorList
from evaluators.evaluator import Evaluator
class InvalidScoreLogitsProcessor(LogitsProcessor):
def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
if torch.isnan(scores).any() or torch.isinf(scores).any():
@@ -14,6 +15,7 @@ class InvalidScoreLogitsProcessor(LogitsProcessor):
scores[..., 5] = 5e4
return scores
class ChatGLM_Evaluator(Evaluator):
def __init__(self, choices, k, model_name, device, finetune=None):
super(ChatGLM_Evaluator, self).__init__(choices, model_name, k)
@@ -21,7 +23,7 @@ class ChatGLM_Evaluator(Evaluator):
# or directly clone the model
self.tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True, mirror="tuna")
if finetune:
CHECKPOINT_PATH="ptuning/" + finetune
CHECKPOINT_PATH = "ptuning/" + finetune
config = AutoConfig.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True, pre_seq_len=128)
self.model = AutoModel.from_pretrained("THUDM/chatglm-6b", config=config, trust_remote_code=True)
prefix_state_dict = torch.load(os.path.join(CHECKPOINT_PATH, "pytorch_model.bin"))
@@ -34,7 +36,8 @@ class ChatGLM_Evaluator(Evaluator):
self.model.transformer.prefix_encoder.float()
print("Model loaded! use GLM + " + finetune)
else:
self.model = AutoModel.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True, mirror="tuna", resume_download=True).half().to(device)
self.model = AutoModel.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True, mirror="tuna",
resume_download=True).half().to(device)
print("Model loaded!(GLM)")
# self.model = self.model.eval()
@@ -45,6 +48,7 @@ class ChatGLM_Evaluator(Evaluator):
if few_shot:
result = []
score = []
answer_list = []
if few_shot:
history = self.generate_few_shot_prompt(subject_name, dev_df, cot=cot)
else:
@@ -57,8 +61,9 @@ class ChatGLM_Evaluator(Evaluator):
response = response.strip()
# For ChatGLM, we use answer extraction in answer-only mode too.
ans, direct_extract = self.extract_cot_answer(row, response)
else: # zero-shot by extracting answer from distribution
ans = self.generate_dist(self.model, self.tokenizer, question, do_sample=False, max_length=2048, history=history)
else: # zero-shot by extracting answer from distribution
ans = self.generate_dist(self.model, self.tokenizer, question, do_sample=False, max_length=2048,
history=history)
if ans == answers[row_index]:
correct_num += 1
correct = 1
@@ -68,12 +73,14 @@ class ChatGLM_Evaluator(Evaluator):
if few_shot:
result.append(response)
score.append(correct)
correct_ratio = 100*correct_num/len(answers)
answer_list.append(ans)
correct_ratio = 100 * correct_num / len(answers)
if save_result_dir:
if few_shot:
test_df['model_output'] = result
test_df['correctness'] = score
test_df['model_answer'] = answer_list
result_file_name = f'{subject_name}_{correct_ratio}_test.csv'
if few_shot:
result_file_name = f'{subject_name}_{correct_ratio}_few_shot_test.csv'
@@ -81,12 +88,24 @@ class ChatGLM_Evaluator(Evaluator):
return correct_ratio
    def eval_qa(self, subject_name, qa_df, save_result_dir=None):
        # Free-form QA evaluation: ask the model each question and save its raw answer for later scoring.
        history = []
for row_index, row in tqdm(qa_df.iterrows(), total=len(qa_df)):
question = row['question']
response, _ = self.model.chat(self.tokenizer, question, do_sample=False, history=history)
response = response.strip()
qa_df.loc[row_index, 'model_output'] = response
if save_result_dir:
result_file_name = f'{subject_name}_qa_test_result.csv'
qa_df.to_csv(os.path.join(save_result_dir, result_file_name))
def generate_few_shot_prompt(self, subject, dev_df, cot=False):
message = []
k = self.k
if self.k == -1:
k = dev_df.shape[0]
message.append(self.format_example(dev_df.iloc[0, :], cot=cot, add_prompt=f"以下是中国关于{subject}考试的单项选择题,请选出其中的正确答案。\n\n"))
message.append(self.format_example(dev_df.iloc[0, :], cot=cot,
add_prompt=f"以下是中国关于{subject}考试的单项选择题,请选出其中的正确答案。\n\n"))
for i in range(1, k):
message.append(self.format_example(dev_df.iloc[i, :], cot=cot))
return message
@@ -164,5 +183,6 @@ class ChatGLM_Evaluator(Evaluator):
score = outputs.scores[0][0].tolist()
choice_score = [score[167], score[333], score[251], score[416]]
ranked_index = [index for index, value in sorted(list(enumerate(choice_score)), key=lambda x:x[1], reverse=True)]
ranked_index = [index for index, value in
sorted(list(enumerate(choice_score)), key=lambda x: x[1], reverse=True)]
return self.choices[ranked_index[0]]

@@ -36,6 +36,7 @@ class ChatGLM_Evaluator(Evaluator):
if few_shot:
result = []
score = []
answer_list = []
if few_shot:
history = self.generate_few_shot_prompt(subject_name, dev_df, cot=cot)
else:
@@ -58,6 +59,7 @@ class ChatGLM_Evaluator(Evaluator):
if save_result_dir:
if few_shot:
result.append(response)
answer_list.append(ans)
score.append(correct)
correct_ratio = 100*correct_num/len(answers)
@@ -65,6 +67,7 @@ class ChatGLM_Evaluator(Evaluator):
if few_shot:
test_df['model_output'] = result
test_df['correctness'] = score
test_df['model_answer'] = answer_list
result_file_name = f'{subject_name}_{correct_ratio}_test.csv'
if few_shot:
result_file_name = f'{subject_name}_{correct_ratio}_few_shot_test.csv'

@@ -29,6 +29,9 @@ class Evaluator:
def eval_subject(self, subject_name, test_df, dev_df=None, few_shot=False, save_result_dir=None):
pass
def eval_qa(self, subject_name, qa_df, save_result_dir=None):
pass
def normalize_answer(self,s):
def white_space_fix(text):

@@ -0,0 +1,77 @@
import openai
import re
class ModelScorer:
    """Scores a model-generated answer against a reference answer using the ChatGPT API."""

    def __init__(self, api_key):
        openai.api_key = api_key
        self.eval_mode = "accuracy"
def score_with_chatgpt(self, text):
try:
            # Submit the prompt to ChatGPT to obtain a score
response = openai.ChatCompletion.create(
model="gpt-3.5-turbo",
messages=text,
)
            # Extract the reply text and parse the score from it
            chatgpt_response = response.choices[0]['message']['content']
            chatgpt_score = self.extract_score(chatgpt_response)
            return chatgpt_response, chatgpt_score
        except Exception as e:
            print("An error occurred while scoring with ChatGPT:", e)
            return None, None
def generate_scoring_prompt(self, question, model_result, reference):
        # Build the scoring prompt according to the evaluation mode
base_prompt = []
if self.eval_mode == "accuracy":
base_prompt = [{
"role": "system",
"content": "你是一个汽车领域专家,接下来将向你提供一个问题、一个参考答案和一个大模型生成的结果。"
"请对比参考答案和大模型生成结果从信息准确性的角度评分以下生成的结果以评估其质量。满分为5分。"
"评分标准为信息准确无误——5分。信息大致符合实际信息——4分。"
"信息不全面但明确表达了自身无法回答——3分。信息完全错误——2分。回答无关——1分。"
"可以根据实际情况稍作调整。"
"回复格式为评分为x分。理由xxx。"
}]
prompt = base_prompt + [
{
"role": "user",
"content": f"问题:{question}\n\n生成的结果:{model_result}\n\n参考答案:{reference}"
}
]
return prompt
    def extract_score(self, response_text):
        # Extract the numeric score from the reply (expected reply format: "评分为x分。理由xxx。")
        pattern = [
            r"^评分为([1-5])分",
        ]
        for p in pattern:
            score_list = re.findall(p, response_text)
            if score_list:
                return score_list[0]
        # No score found in the reply
        return None
# Example usage
if __name__ == "__main__":
    my_api_key = "YOUR_OPENAI_API_KEY"  # replace with your own OpenAI API key
    # Initialize the model scorer
scorer = ModelScorer(my_api_key)
    # The question, the model output to be scored, and the reference answer
question = "秦Plus-DMi车型的安全气囊有哪些类型"
model_result = "截止到我最后更新知识的时候关于秦Plus-DMi车型的具体安全气囊类型的信息我并没有。通常来说汽车的安全气囊系统可能包括驾驶员气囊、副驾驶气囊、侧面气囊、头部气囊等。但具体车型的安全气囊配置可能会因地区、年份和车型的不同而有所差异。建议您直接查询该车型的官方资料或者联系经销商以获取最准确的信息。"
reference = "秦Plus-DMi配备有驾驶员安全气囊、前排乘员安全气囊、侧帘式安全气囊和座椅侧安全气囊。"
prompt = scorer.generate_scoring_prompt(question, model_result, reference)
    # Get ChatGPT's score for the model output
    response, score = scorer.score_with_chatgpt(prompt)
    if response is not None:
        print("ChatGPT score:", score, "\nChatGPT reply:", response)
    else:
        print("Failed to get a score from ChatGPT.")

@@ -14,4 +14,4 @@ def get_rouge_score(s1, s2):
if __name__ == "__main__":
print('hello')
print(get_rouge_score("I love you", "I like you"))
print(get_rouge_score("比亚迪秦PLUS-DMi是一款混合动力汽车。", "比亚迪秦PLUS-DMi是一款混动车。"))
